MarkdownSharp: src/MarkdownOld.cs ソースファイル

00001 /*
00002  * Markdown  -  A text-to-HTML conversion tool for web writers
00003  * Copyright (c) 2004 John Gruber
00004  * http://daringfireball.net/projects/markdown/
00005  * 
00006  * Copyright (c) 2004 Michel Fortin - Translation to PHP
00007  * http://www.michelf.com/projects/php-markdown/
00008  * 
00009  * Copyright (c) 2004-2005 Milan Negovan - C# translation to .NET
00010  * http://www.aspnetresources.com
00011  * 
00012  */
00013 
00014 #region Copyright and license
00015 
00016 /*
00017 Copyright (c) 2003-2004 John Gruber   
00018 <http://daringfireball.net/>   
00019 All rights reserved.
00020 
00021 Redistribution and use in source and binary forms, with or without
00022 modification, are permitted provided that the following conditions are
00023 met:
00024 
00025 * Redistributions of source code must retain the above copyright notice,
00026   this list of conditions and the following disclaimer.
00027 
00028 * Redistributions in binary form must reproduce the above copyright
00029   notice, this list of conditions and the following disclaimer in the
00030   documentation and/or other materials provided with the distribution.
00031 
00032 * Neither the name "Markdown" nor the names of its contributors may
00033   be used to endorse or promote products derived from this software
00034   without specific prior written permission.
00035 
00036 This software is provided by the copyright holders and contributors "as
00037 is" and any express or implied warranties, including, but not limited
00038 to, the implied warranties of merchantability and fitness for a
00039 particular purpose are disclaimed. In no event shall the copyright owner
00040 or contributors be liable for any direct, indirect, incidental, special,
00041 exemplary, or consequential damages (including, but not limited to,
00042 procurement of substitute goods or services; loss of use, data, or
00043 profits; or business interruption) however caused and on any theory of
00044 liability, whether in contract, strict liability, or tort (including
00045 negligence or otherwise) arising in any way out of the use of this
00046 software, even if advised of the possibility of such damage.
00047 */
00048 
00049 #endregion
00050 
00051 using System;
00052 using System.Collections;
00053 using System.Security.Cryptography;
00054 using System.Text;
00055 using System.Text.RegularExpressions;
00056 
00057 namespace MarkdownSharp
00058 {
00059     [Obsolete("This old version is included only for historical comparison purposes; use at your own risk!")]
00060     public class MarkdownOld
00061     {
00062         public class Pair
00063         {
00064             public Object First;
00065             public Object Second;
00066         }
00067 
00068         #region Class members
00069 
        // Number of times the bracket fragments are repeated when the static
        // constructor builds the nestedBrackets pattern.
        private const int nestedBracketDepth = 6;
        private const string emptyElementSuffix = " />"; // Change to ">" for HTML output
        // Several patterns allow up to (tabWidth - 1) leading spaces before a construct.
        private const int tabWidth = 4;

        // Regex fragments for list item markers; assigned in the static constructor.
        private static readonly string markerUL;
        private static readonly string markerOL;
        private static readonly string markerAny;

        // Shared pattern fragment and escape lookup tables; built once in the static constructor.
        private static readonly string nestedBrackets;
        private static readonly Hashtable escapeTable;
        private static readonly Hashtable backslashEscapeTable;

        // Per-instance tables: link id -> URL, link id -> title (filled by LinkEvaluator)
        // and hash key -> raw HTML block (filled by HtmlEvaluator).
        private Hashtable urls;
        private Hashtable titles;
        private Hashtable htmlBlocks;

        // Read by DoLists to choose between the nested-list and top-level list prefix.
        private int listLevel = 0;
00087 
00088         #endregion
00089 
00096         static MarkdownOld()
00097         {
00098             nestedBrackets += RepeatString(@"(?>[^\[\]]+|\[", nestedBracketDepth);
00099             nestedBrackets += RepeatString(@"\])*", nestedBracketDepth);
00100 
00101             markerUL = @"[*+-]";
00102             markerOL = @"\d+[.]";
00103             markerAny = string.Format("(?:{0}|{1})", markerUL, markerOL);
00104 
00105             // Table of hash values for escaped characters:
00106             escapeTable = new Hashtable();
00107 
00108             escapeTable[@"\"] = ComputeMD5(@"\");
00109             escapeTable["`"] = ComputeMD5("`");
00110             escapeTable["*"] = ComputeMD5("*");
00111             escapeTable["_"] = ComputeMD5("_");
00112             escapeTable["{"] = ComputeMD5("{");
00113             escapeTable["}"] = ComputeMD5("}");
00114             escapeTable["["] = ComputeMD5("[");
00115             escapeTable["]"] = ComputeMD5("]");
00116             escapeTable["("] = ComputeMD5("(");
00117             escapeTable[")"] = ComputeMD5(")");
00118             escapeTable[">"] = ComputeMD5(">");
00119             escapeTable["#"] = ComputeMD5("#");
00120             escapeTable["+"] = ComputeMD5("+");
00121             escapeTable["-"] = ComputeMD5("-");
00122             escapeTable["."] = ComputeMD5(".");
00123             escapeTable["!"] = ComputeMD5("!");
00124 
00125             // Create an identical table but for escaped characters.
00126             backslashEscapeTable = new Hashtable();
00127 
00128             foreach (string key in escapeTable.Keys)
00129                 backslashEscapeTable[@"\" + key] = escapeTable[key];
00130         }
00131 
00132         public MarkdownOld()
00133         {
00134             urls = new Hashtable();
00135             titles = new Hashtable();
00136             htmlBlocks = new Hashtable();
00137         }
00138 
00145         public string Transform(string text)
00146         {
00147             // Standardize line endings:
00148             // DOS to Unix and Mac to Unix
00149             text = text.Replace("\r\n", "\n").Replace("\r", "\n");
00150 
00151             // Make sure $text ends with a couple of newlines:
00152             text += "\n\n";
00153 
00154             // Convert all tabs to spaces.
00155             text = Detab(text);
00156 
00157             // Strip any lines consisting only of spaces and tabs.
00158             // This makes subsequent regexen easier to write, because we can
00159             // match consecutive blank lines with /\n+/ instead of something
00160             // contorted like /[ \t]*\n+/ .
00161             text = Regex.Replace(text, @"^[ \t]+$", string.Empty, RegexOptions.Multiline);
00162 
00163             // Turn block-level HTML blocks into hash entries
00164             text = HashHTMLBlocks(text);
00165 
00166             // Strip link definitions, store in hashes.
00167             text = StripLinkDefinitions(text);
00168 
00169             text = RunBlockGamut(text);
00170 
00171             text = UnescapeSpecialChars(text);
00172 
00173             return text + "\n";
00174         }
00175 
00176         #region Process link definitions
00177 
00182         private string StripLinkDefinitions(string text)
00183         {
00184             string pattern = string.Format(@"
00185                         ^[ ]{{0,{0}}}\[(.+)\]:  # id = $1
00186                           [ \t]*
00187                           \n?               # maybe *one* newline
00188                           [ \t]*
00189                         <?(\S+?)>?          # url = $2
00190                           [ \t]*
00191                           \n?               # maybe one newline
00192                           [ \t]*
00193                         (?:
00194                             (?<=\s)         # lookbehind for whitespace
00195                             [\x22(]
00196                             (.+?)           # title = $3
00197                             [\x22)]
00198                             [ \t]*
00199                         )?  # title is optional
00200                         (?:\n+|\Z)", tabWidth - 1);
00201 
00202             text = Regex.Replace(text, pattern, new MatchEvaluator(LinkEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00203             return text;
00204         }
00205 
00206         private string LinkEvaluator(Match match)
00207         {
00208             string linkID = match.Groups[1].Value.ToLower();
00209             urls[linkID] = EncodeAmpsAndAngles(match.Groups[2].Value);
00210 
00211             if (match.Groups[3] != null && match.Groups[3].Length > 0)
00212                 titles[linkID] = match.Groups[3].Value.Replace("\"", "&quot;");
00213 
00214             return string.Empty;
00215         }
00216 
00217         #endregion
00218 
00219         #region Hashify HTML blocks
00220 
00224         private string HashHTMLBlocks(string text)
00225         {
00226             /*
00227              We only want to do this for block-level HTML tags, such as headers,
00228              lists, and tables. That's because we still want to wrap <p>s around
00229              "paragraphs" that are wrapped in non-block-level tags, such as anchors,
00230              phrase emphasis, and spans. The list of tags we're looking for is
00231              hard-coded:
00232             */
00233             string blockTags1 = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del";
00234             string blockTags2 = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math";
00235 
00236             /*
00237              First, look for nested blocks, e.g.:
00238             <div>
00239                 <div>
00240                 tags for inner block must be indented.
00241                 </div>
00242             </div>
00243             
00244              The outermost tags must start at the left margin for this to match, and
00245              the inner nested divs must be indented.
00246              We need to do this before the next, more liberal match, because the next
00247              match will start at the first `<div>` and stop at the first `</div>`.
00248             */
00249             string pattern = string.Format(@"
00250                 (                       # save in $1
00251                     ^                   # start of line  (with /m)
00252                     <({0})              # start tag = $2
00253                     \b                  # word break
00254                     (.*\n)*?            # any number of lines, minimally matching
00255                     </\2>               # the matching end tag
00256                     [ \t]*              # trailing spaces/tabs
00257                     (?=\n+|\Z)          # followed by a newline or end of document
00258                 )", blockTags1);
00259 
00260             text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00261 
00262             // Now match more liberally, simply from `\n<tag>` to `</tag>\n`
00263             pattern = string.Format(@"
00264                (                        # save in $1
00265                     ^                   # start of line  (with /m)
00266                     <({0})              # start tag = $2
00267                     \b                  # word break
00268                     (.*\n)*?            # any number of lines, minimally matching
00269                     .*</\2>             # the matching end tag
00270                     [ \t]*              # trailing spaces/tabs
00271                     (?=\n+|\Z)          # followed by a newline or end of document
00272                 )", blockTags2);
00273 
00274             text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00275 
00276             // Special case just for <hr />. It was easier to make a special case than
00277             // to make the other regex more complicated.
00278             pattern = string.Format(@"
00279                 (?:
00280                     (?<=\n\n)           # Starting after a blank line
00281                     |                   # or
00282                     \A\n?               # the beginning of the doc
00283                 )
00284                 (                       # save in $1
00285                     [ ]{{0, {0}}}
00286                     <(hr)               # start tag = $2
00287                     \b                  # word break
00288                     ([^<>])*?           #
00289                     /?>                 # the matching end tag
00290                     [ \t]*
00291                     (?=\n{{2,}}|\Z)     # followed by a blank line or end of document
00292                 )", tabWidth - 1);
00293             text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.IgnorePatternWhitespace);
00294 
00295             // Special case for standalone HTML comments:
00296             pattern = string.Format(@"
00297                 (?:
00298                     (?<=\n\n)       # Starting after a blank line
00299                     |               # or
00300                     \A\n?           # the beginning of the doc
00301                 )
00302                 (                       # save in $1
00303                     [ ]{{0,{0}}}
00304                     (?s:
00305                         <!
00306                         (--.*?--\s*)+
00307                         >
00308                     )
00309                     [ \t]*
00310                     (?=\n{{2,}}|\Z)     # followed by a blank line or end of document
00311                 )", tabWidth - 1);
00312             text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.IgnorePatternWhitespace);
00313 
00314             return text;
00315         }
00316 
00317         private string HtmlEvaluator(Match match)
00318         {
00319             string text = match.Groups[1].Value;
00320             string key = ComputeMD5(text);
00321             htmlBlocks[key] = text;
00322 
00323             // # String that will replace the block
00324             return string.Concat("\n\n", key, "\n\n");
00325         }
00326 
00327         #endregion
00328 
00329         #region Run transformations that form block-level elements (RunBlockGamut)
00330 
00335         private string RunBlockGamut(string text)
00336         {
00337             text = DoHeaders(text);
00338 
00339             // Do Horizontal Rules:
00340             text = Regex.Replace(text, @"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", "<hr" + emptyElementSuffix + "\n", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00341             text = Regex.Replace(text, @"^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$", "<hr" + emptyElementSuffix + "\n", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00342             text = Regex.Replace(text, @"^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$", "<hr" + emptyElementSuffix + "\n", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00343 
00344 
00345             text = DoLists(text);
00346             text = DoCodeBlocks(text);
00347             text = DoBlockQuotes(text);
00348 
00349             /*
00350                 We already ran _HashHTMLBlocks() before, in Markdown(), but that
00351                 was to escape raw HTML in the original Markdown source. This time,
00352                 we're escaping the markup we've just created, so that we don't wrap
00353                 <p> tags around block-level tags.
00354             */
00355             text = HashHTMLBlocks(text);
00356 
00357             text = FormParagraphs(text);
00358 
00359             return text;
00360         }
00361 
00362         #endregion
00363 
00364         #region Run transformations within block-level elements (RunSpanGamut)
00365 
00370         private string RunSpanGamut(string text)
00371         {
00372             text = DoCodeSpans(text);
00373 
00374             text = EscapeSpecialChars(text);
00375 
00376             // Process anchor and image tags. Images must come first,
00377             // because ![foo][f] looks like an anchor.
00378             text = DoImages(text);
00379             text = DoAnchors(text);
00380 
00381             // Make links out of things like `<http://example.com/>`
00382             // Must come after DoAnchors(), because you can use < and >
00383             // delimiters in inline links like [this](<url>).
00384             text = DoAutoLinks(text);
00385 
00386             // Fix unencoded ampersands and <'s:
00387             text = EncodeAmpsAndAngles(text);
00388 
00389             text = DoItalicsAndBold(text);
00390 
00391             // Do hard breaks:  
00392             text = Regex.Replace(text, @" {2,}\n", string.Format("<br{0}\n", emptyElementSuffix));
00393 
00394             return text;
00395         }
00396 
00397         #endregion
00398 
00399         #region Parse HTML into tokens
00400 
00411         private ArrayList TokenizeHTML(string text)
00412         {
00413             // Regular expression derived from the _tokenize() subroutine in 
00414             // Brad Choate's MTRegex plugin.
00415             // http://www.bradchoate.com/past/mtregex.php
00416             int pos = 0;
00417             int depth = 6;
00418             ArrayList tokens = new ArrayList();
00419 
00420 
00421             string nestedTags = string.Concat(RepeatString(@"(?:<[a-z\/!$](?:[^<>]|", depth),
00422                 RepeatString(@")*>)", depth));
00423             string pattern = string.Concat(@"(?s:<!(?:--.*?--\s*)+>)|(?s:<\?.*?\?>)|", nestedTags);
00424 
00425             MatchCollection mc = Regex.Matches(text, pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
00426 
00427             foreach (Match m in mc)
00428             {
00429                 string wholeTag = m.Value;
00430                 int tagStart = m.Index;
00431                 Pair token = null;
00432 
00433                 if (pos < tagStart)
00434                 {
00435                     token = new Pair();
00436                     token.First = "text";
00437                     token.Second = text.Substring(pos, tagStart - pos);
00438                     tokens.Add(token);
00439                 }
00440 
00441                 token = new Pair();
00442                 token.First = "tag";
00443                 token.Second = wholeTag;
00444                 tokens.Add(token);
00445 
00446                 pos = m.Index + m.Length;
00447             }
00448 
00449             if (pos < text.Length)
00450             {
00451                 Pair token = new Pair();
00452                 token.First = "text";
00453                 token.Second = text.Substring(pos, text.Length - pos);
00454                 tokens.Add(token);
00455             }
00456 
00457             return tokens;
00458         }
00459 
00460         #endregion
00461 
00462         #region Escape special characters
00463 
00464         private string EscapeSpecialChars(string text)
00465         {
00466             ArrayList tokens = TokenizeHTML(text);
00467 
00468             // Rebuild text from the tokens
00469             text = string.Empty;
00470 
00471             foreach (Pair token in tokens)
00472             {
00473                 string value = token.Second.ToString();
00474 
00475                 if (token.First.Equals("tag"))
00476                     /*
00477                         Within tags, encode * and _ so they don't conflict with their use 
00478                         in Markdown for italics and strong. We're replacing each 
00479                         such character with its corresponding MD5 checksum value; 
00480                         this is likely overkill, but it should prevent us from colliding
00481                         with the escape values by accident.
00482                     */
00483                     value = value.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00484                 else
00485                     value = EncodeBackslashEscapes(value);
00486 
00487                 text += value;
00488             }
00489 
00490             return text;
00491         }
00492 
00493         #endregion
00494 
00495         #region Process referenced and inline anchors
00496 
00500         private string DoAnchors(string text)
00501         {
00502             //
00503             // First, handle reference-style links: [link text] [id]
00504             //
00505             string pattern = string.Format(@"
00506             (                               # wrap whole match in $1
00507                 \[
00508                     ({0})                   # link text = $2
00509                 \]
00510 
00511                 [ ]?                        # one optional space
00512                 (?:\n[ ]*)?                 # one optional newline followed by spaces
00513 
00514                 \[
00515                     (.*?)                   # id = $3
00516                 \]
00517             )", nestedBrackets);
00518 
00519             text = Regex.Replace(text, pattern, new MatchEvaluator(AnchorReferenceEvaluator), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
00520 
00521             //
00522             // Next, inline-style links: [link text](url "optional title")
00523             //
00524             pattern = string.Format(@"
00525                 (                          # wrap whole match in $1
00526                     \[
00527                         ({0})              # link text = $2
00528                     \]
00529                     \(                     # literal paren
00530                         [ \t]*
00531                         <?(.*?)>?          # href = $3
00532                         [ \t]*
00533                         (                  # $4
00534                         (['\x22])          # quote char = $5
00535                         (.*?)              # Title = $6
00536                         \5                 # matching quote
00537                         )?                 # title is optional
00538                     \)
00539                 )", nestedBrackets);
00540 
00541             text = Regex.Replace(text, pattern, new MatchEvaluator(AnchorInlineEvaluator), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
00542 
00543             return text;
00544         }
00545 
00546         private string AnchorReferenceEvaluator(Match match)
00547         {
00548             string wholeMatch = match.Groups[1].Value;
00549             string linkText = match.Groups[2].Value;
00550             string linkID = match.Groups[3].Value.ToLower();
00551             string url = null;
00552             string res = null;
00553             string title = null;
00554 
00555             // for shortcut links like [this][].
00556             if (linkID.Equals(string.Empty))
00557                 linkID = linkText.ToLower();
00558 
00559             if (urls[linkID] != null)
00560             {
00561                 url = urls[linkID].ToString();
00562 
00563                 //We've got to encode these to avoid conflicting with italics/bold.
00564                 url = url.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00565                 res = string.Format("<a href=\"{0}\"", url);
00566 
00567                 if (titles[linkID] != null)
00568                 {
00569                     title = titles[linkID].ToString();
00570                     title = title.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00571                     res += string.Format(" title=\"{0}\"", title);
00572                 }
00573 
00574                 res += string.Format(">{0}</a>", linkText);
00575             }
00576             else
00577                 res = wholeMatch;
00578 
00579             return res;
00580         }
00581 
00582         private string AnchorInlineEvaluator(Match match)
00583         {
00584             string linkText = match.Groups[2].Value;
00585             string url = match.Groups[3].Value;
00586             string title = match.Groups[6].Value;
00587             string res = null;
00588 
00589             // We've got to encode these to avoid conflicting with italics/bold.
00590             url = url.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00591             res = string.Format("<a href=\"{0}\"", url);
00592 
00593             if (title != null && title.Length > 0)
00594             {
00595                 title = title.Replace("\"", "&quot;").Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00596                 res += string.Format(" title=\"{0}\"", title);
00597             }
00598 
00599             res += string.Format(">{0}</a>", linkText);
00600             return res;
00601         }
00602 
00603         #endregion
00604 
00605         #region Process inline and referenced images
00606 
00610         private string DoImages(string text)
00611         {
00612             // First, handle reference-style labeled images: ![alt text][id]
00613             string pattern = @"
00614                     (               # wrap whole match in $1
00615                     !\[
00616                         (.*?)       # alt text = $2
00617                     \]
00618 
00619                     [ ]?            # one optional space
00620                     (?:\n[ ]*)?     # one optional newline followed by spaces
00621 
00622                     \[
00623                         (.*?)       # id = $3
00624                     \]
00625 
00626                     )";
00627 
00628             text = Regex.Replace(text, pattern, new MatchEvaluator(ImageReferenceEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
00629 
00630             // Next, handle inline images:  ![alt text](url "optional title")
00631             // Don't forget: encode * and _
00632             pattern = @"
00633                 (               # wrap whole match in $1
00634                 !\[
00635                     (.*?)       # alt text = $2
00636                 \]
00637                 \(              # literal paren
00638                     [ \t]*
00639                     <?(\S+?)>?  # src url = $3
00640                     [ \t]*
00641                     (           # $4
00642                     (['\x22])   # quote char = $5
00643                     (.*?)       # title = $6
00644                     \5          # matching quote
00645                     [ \t]*
00646                     )?          # title is optional
00647                 \)
00648                 )";
00649 
00650             text = Regex.Replace(text, pattern, new MatchEvaluator(ImageInlineEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
00651 
00652             return text;
00653         }
00654 
00655         private string ImageReferenceEvaluator(Match match)
00656         {
00657             string wholeMatch = match.Groups[1].Value;
00658             string altText = match.Groups[2].Value;
00659             string linkID = match.Groups[3].Value.ToLower();
00660             string url = null;
00661             string res = null;
00662             string title = null;
00663 
00664             // for shortcut links like ![this][].
00665             if (linkID.Equals(string.Empty))
00666                 linkID = altText.ToLower();
00667 
00668             altText = altText.Replace("\"", "&quot;");
00669 
00670             if (urls[linkID] != null)
00671             {
00672                 url = urls[linkID].ToString();
00673 
00674                 // We've got to encode these to avoid conflicting with italics/bold.
00675                 url = url.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00676                 res = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, altText);
00677 
00678                 if (titles[linkID] != null)
00679                 {
00680                     title = titles[linkID].ToString();
00681                     title = title.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00682 
00683                     res += string.Format(" title=\"{0}\"", title);
00684                 }
00685 
00686                 res += emptyElementSuffix;
00687             }
00688             else
00689             {
00690                 // If there's no such link ID, leave intact:
00691                 res = wholeMatch;
00692             }
00693 
00694             return res;
00695         }
00696 
00697         private string ImageInlineEvaluator(Match match)
00698         {
00699             string altText = match.Groups[2].Value;
00700             string url = match.Groups[3].Value;
00701             string title = match.Groups[6].Value;
00702             string res = null;
00703 
00704 
00705             altText = altText.Replace("\"", "&quot;");
00706             title = title.Replace("\"", "&quot;");
00707 
00708             // We've got to encode these to avoid conflicting with italics/bold.
00709             url = url.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00710             res = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, altText);
00711 
00712             title = title.Replace("*", escapeTable["*"].ToString()).Replace("_", escapeTable["_"].ToString());
00713             res += string.Format(" title=\"{0}\"", title);
00714 
00715             res += emptyElementSuffix;
00716             return res;
00717         }
00718 
00719         #endregion
00720 
00721         #region Process headers
00722 
00723         private string DoHeaders(string text)
00724         {
00725             /*
00726             Setext-style headers:
00727             
00728             Header 1
00729             ========
00730               
00731             Header 2
00732             --------
00733             */
00734 
00735             text = Regex.Replace(text, @"^(.+)[ \t]*\n=+[ \t]*\n+", new MatchEvaluator(SetextHeader1Evaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00736             text = Regex.Replace(text, @"^(.+)[ \t]*\n-+[ \t]*\n+", new MatchEvaluator(SetextHeader2Evaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00737 
00738             /*
00739              atx-style headers:
00740                 # Header 1
00741                 ## Header 2
00742                 ## Header 2 with closing hashes ##
00743                 ...
00744                 ###### Header 6
00745             */
00746             string pattern = @"
00747                 ^(\#{1,6})  # $1 = string of #'s
00748                 [ \t]*
00749                 (.+?)       # $2 = Header text
00750                 [ \t]*
00751                 \#*         # optional closing #'s (not counted)
00752                 \n+";
00753 
00754             text = Regex.Replace(text, pattern, new MatchEvaluator(AtxHeaderEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00755 
00756             return text;
00757         }
00758 
00759         private string SetextHeader1Evaluator(Match match)
00760         {
00761             string header = match.Groups[1].Value;
00762             return string.Concat("<h1>", RunSpanGamut(header), "</h1>\n\n");
00763         }
00764 
00765         private string SetextHeader2Evaluator(Match match)
00766         {
00767             string header = match.Groups[1].Value;
00768             return string.Concat("<h2>", RunSpanGamut(header), "</h2>\n\n");
00769         }
00770 
00771         private string AtxHeaderEvaluator(Match match)
00772         {
00773             string headerSig = match.Groups[1].Value;
00774             string headerText = match.Groups[2].Value;
00775 
00776             return string.Concat("<h", headerSig.Length, ">", RunSpanGamut(headerText), "</h", headerSig.Length, ">\n\n");
00777         }
00778 
00779         #endregion
00780 
00781         #region Process ordered and unordered lists
00782 
00783         private string DoLists(string text)
00784         {
00785             // Re-usable pattern to match any entirel ul or ol list:
00786             string pattern = null;
00787 
00788             string wholeList = string.Format(@"
00789             (                               # $1 = whole list
00790               (                             # $2
00791                 [ ]{{0,{1}}}
00792                 ({0})                       # $3 = first list item marker
00793                 [ \t]+
00794               )
00795               (?s:.+?)
00796               (                             # $4
00797                   \z
00798                 |
00799                   \n{{2,}}
00800                   (?=\S)
00801                   (?!                       # Negative lookahead for another list item marker
00802                     [ \t]*
00803                     {0}[ \t]+
00804                   )
00805               )
00806             )", markerAny, tabWidth - 1);
00807 
00808             // We use a different prefix before nested lists than top-level lists.
00809             // See extended comment in _ProcessListItems().
00810             if (listLevel > 0)
00811             {
00812                 pattern = "^" + wholeList;
00813                 text = Regex.Replace(text, pattern, new MatchEvaluator(ListEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00814             }
00815             else
00816             {
00817                 pattern = @"(?:(?<=\n\n)|\A\n?)" + wholeList;
00818                 text = Regex.Replace(text, pattern, new MatchEvaluator(ListEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00819             }
00820 
00821             return text;
00822         }
00823 
00824         private string ListEvaluator(Match match)
00825         {
00826             string list = match.Groups[1].Value;
00827             string listType = Regex.IsMatch(match.Groups[3].Value, markerUL) ? "ul" : "ol";
00828             string result = null;
00829 
00830             // Turn double returns into triple returns, so that we can make a
00831             // paragraph for the last item in a list, if necessary:
00832             list = Regex.Replace(list, @"\n{2,}", "\n\n\n");
00833             result = ProcessListItems(list, markerAny);
00834             result = string.Format("<{0}>\n{1}</{0}>\n", listType, result);
00835 
00836             return result;
00837         }
00838 
00843         private string ProcessListItems(string list, string marker)
00844         {
00845             /*
00846                 The listLevel global keeps track of when we're inside a list.
00847                 Each time we enter a list, we increment it; when we leave a list,
00848                 we decrement. If it's zero, we're not in a list anymore.
00849             
00850                 We do this because when we're not inside a list, we want to treat
00851                 something like this:
00852             
00853                     I recommend upgrading to version
00854                     8. Oops, now this line is treated
00855                     as a sub-list.
00856             
00857                 As a single paragraph, despite the fact that the second line starts
00858                 with a digit-period-space sequence.
00859             
00860                 Whereas when we're inside a list (or sub-list), that line will be
00861                 treated as the start of a sub-list. What a kludge, huh? This is
00862                 an aspect of Markdown's syntax that's hard to parse perfectly
00863                 without resorting to mind-reading. Perhaps the solution is to
00864                 change the syntax rules such that sub-lists must start with a
00865                 starting cardinal number; e.g. "1." or "a.".
00866             */
00867 
00868             listLevel++;
00869 
00870             // Trim trailing blank lines:
00871             list = Regex.Replace(list, @"\n{2,}\z", "\n");
00872 
00873             string pattern = string.Format(
00874               @"(\n)?                      # leading line = $1
00875                 (^[ \t]*)                  # leading whitespace = $2
00876                 ({0}) [ \t]+               # list marker = $3
00877                 ((?s:.+?)                  # list item text = $4
00878                 (\n{{1,2}}))      
00879                 (?= \n* (\z | \2 ({0}) [ \t]+))", marker);
00880 
00881             list = Regex.Replace(list, pattern, new MatchEvaluator(ListEvaluator2),
00882                                   RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
00883             listLevel--;
00884             return list;
00885         }
00886 
00887         private string ListEvaluator2(Match match)
00888         {
00889             string item = match.Groups[4].Value;
00890             string leadingLine = match.Groups[1].Value;
00891 
00892 
00893             if ((leadingLine != null && leadingLine != string.Empty) || Regex.IsMatch(item, @"\n{2,}"))
00894                 item = RunBlockGamut(Outdent(item));
00895             else
00896             {
00897                 // Recursion for sub-lists:
00898                 item = DoLists(Outdent(item));
00899                 item = item.TrimEnd('\n');
00900                 item = RunSpanGamut(item);
00901             }
00902 
00903             return string.Format("<li>{0}</li>\n", item);
00904         }
00905 
00906         #endregion
00907 
00908         #region Process code blocks
00909 
00910         private string DoCodeBlocks(string text)
00911         {
00912             // TODO: Should we allow 2 empty lines here or only one?
00913             string pattern = string.Format(@"
00914                     (?:\n\n|\A)
00915                     (                        # $1 = the code block -- one or more lines, starting with a space/tab
00916                     (?:
00917                         (?:[ ]{{{0}}} | \t)  # Lines must start with a tab or a tab-width of spaces
00918                         .*\n+
00919                     )+
00920                     )
00921                     ((?=^[ ]{{0,{0}}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc",
00922                                             tabWidth);
00923 
00924             text = Regex.Replace(text, pattern,
00925                                   new MatchEvaluator(CodeBlockEvaluator),
00926                                   RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
00927 
00928             return text;
00929         }
00930 
00931         private string CodeBlockEvaluator(Match match)
00932         {
00933             string codeBlock = match.Groups[1].Value;
00934             codeBlock = EncodeCode(Outdent(codeBlock));
00935 
00936             // Trim leading newlines and trailing whitespace
00937             codeBlock = Regex.Replace(codeBlock, @"^\n+", string.Empty);
00938             codeBlock = Regex.Replace(codeBlock, @"\s+\z", string.Empty);
00939 
00940             return string.Concat("\n\n<pre><code>", codeBlock, "\n</code></pre>\n\n");
00941         }
00942 
00943         #endregion
00944 
00945         #region Process code spans
00946 
00947         private string DoCodeSpans(string text)
00948         {
00949             /*
00950                 *   Backtick quotes are used for <code></code> spans.
00951                 *   You can use multiple backticks as the delimiters if you want to
00952                     include literal backticks in the code span. So, this input:
00953 
00954                     Just type ``foo `bar` baz`` at the prompt.
00955         
00956                     Will translate to:
00957         
00958                       <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
00959         
00960                     There's no arbitrary limit to the number of backticks you
00961                     can use as delimters. If you need three consecutive backticks
00962                     in your code, use four for delimiters, etc.
00963         
00964                 *   You can use spaces to get literal backticks at the edges:
00965         
00966                       ... type `` `bar` `` ...
00967         
00968                     Turns to:
00969         
00970                       ... type <code>`bar`</code> ...           
00971             */
00972 
00973             string pattern = @"
00974                     (`+)        # $1 = Opening run of `
00975                     (.+?)       # $2 = The code block
00976                     (?<!`)
00977                     \1
00978                     (?!`)";
00979             text = Regex.Replace(text, pattern,
00980                                   new MatchEvaluator(CodeSpanEvaluator),
00981                                   RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
00982 
00983             return text;
00984         }
00985 
00986         private string CodeSpanEvaluator(Match match)
00987         {
00988             string s = match.Groups[2].Value;
00989             s = s.Replace(@"^[ \t]*", string.Empty).Replace(@"[ \t]*$", string.Empty);
00990             s = EncodeCode(s);
00991 
00992             return string.Concat("<code>", s, "</code>");
00993         }
00994 
00995         #endregion
00996 
00997         #region Encode/escape certain characters inside Markdown code runs
00998 
01006         private string EncodeCode(string code)
01007         {
01008             code = code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");
01009 
01010             foreach (string key in escapeTable.Keys)
01011                 code = code.Replace(key, escapeTable[key].ToString());
01012 
01013             return code;
01014         }
01015 
01016         #endregion
01017 
01018         #region Process bold and italics
01019 
01020         private string DoItalicsAndBold(string text)
01021         {
01022             // <strong> must go first:
01023             text = Regex.Replace(text, @"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1",
01024                                   new MatchEvaluator(BoldEvaluator),
01025                                   RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
01026 
01027             // Then <em>:
01028             text = Regex.Replace(text, @"(\*|_) (?=\S) (.+?) (?<=\S) \1",
01029                                   new MatchEvaluator(ItalicsEvaluator),
01030                                   RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
01031             return text;
01032         }
01033 
01034         private string ItalicsEvaluator(Match match)
01035         {
01036             return string.Format("<em>{0}</em>", match.Groups[2].Value);
01037         }
01038 
01039         private string BoldEvaluator(Match match)
01040         {
01041             return string.Format("<strong>{0}</strong>", match.Groups[2].Value);
01042         }
01043 
01044         #endregion
01045 
01046         #region Process blockquotes
01047 
01048         private string DoBlockQuotes(string text)
01049         {
01050             string pattern =
01051                 @"(                     # Wrap whole match in $1
01052                 (
01053                 ^[ \t]*>[ \t]?          # '>' at the start of a line
01054                     .+\n                # rest of the first line
01055                 (.+\n)*                 # subsequent consecutive lines
01056                 \n*                     # blanks
01057                 )+
01058             )";
01059 
01060             text = Regex.Replace(text, pattern, new MatchEvaluator(BlockQuoteEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
01061             return text;
01062         }
01063 
01064         private string BlockQuoteEvaluator(Match match)
01065         {
01066             string bq = match.Groups[1].Value;
01067 
01068             // Trim one level of quoting - trim whitespace-only lines
01069             bq = Regex.Replace(bq, @"^[ \t]*>[ \t]?", string.Empty, RegexOptions.Multiline);
01070             bq = Regex.Replace(bq, @"^[ \t]+$", string.Empty, RegexOptions.Multiline);
01071 
01072             bq = RunBlockGamut(bq);
01073             bq = Regex.Replace(bq, @"^", "  ", RegexOptions.Multiline);
01074 
01075             // These leading spaces screw with <pre> content, so we need to fix that:
01076             bq = Regex.Replace(bq, @"(\s*<pre>.+?</pre>)", new MatchEvaluator(BlockQuoteEvaluator2), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
01077 
01078             return string.Format("<blockquote>\n{0}\n</blockquote>\n\n", bq);
01079         }
01080 
01081         private string BlockQuoteEvaluator2(Match match)
01082         {
01083             string pre = match.Groups[1].Value;
01084             pre = Regex.Replace(pre, @"^  ", string.Empty, RegexOptions.Multiline);
01085 
01086             return pre;
01087         }
01088 
01089         #endregion
01090 
01091         #region Create paragraph tags
01092 
01093         private string FormParagraphs(string text)
01094         {
01095             // Strip leading and trailing lines:
01096             text = Regex.Replace(text, @"^\n+", string.Empty);
01097             text = Regex.Replace(text, @"\n+\z", string.Empty);
01098 
01099             string[] grafs = Regex.Split(text, @"\n{2,}");
01100 
01101             // Wrap <p> tags.
01102             for (int i = 0; i < grafs.Length; i++)
01103             {
01104                 // Milan Negovan: I'm adding an additional check for an empty block of code.
01105                 // Otherwise an empty <p></p> is created.
01106                 if (htmlBlocks[grafs[i]] == null && grafs[i].Length > 0)
01107                 {
01108                     string block = grafs[i];
01109 
01110                     block = RunSpanGamut(block);
01111                     block = Regex.Replace(block, @"^([ \t]*)", "<p>");
01112                     block += "</p>";
01113 
01114                     grafs[i] = block;
01115                 }
01116             }
01117 
01118             // Unhashify HTML blocks
01119             for (int i = 0; i < grafs.Length; i++)
01120             {
01121                 string block = (string)htmlBlocks[grafs[i]];
01122 
01123                 if (block != null)
01124                     grafs[i] = block;
01125             }
01126 
01127             return string.Join("\n\n", grafs);
01128 
01129         }
01130 
01131         #endregion
01132 
01133         #region Process emails and links
01134 
01135         private string DoAutoLinks(string text)
01136         {
01137             text = Regex.Replace(text, "<((https?|ftp):[^'\">\\s]+)>", new MatchEvaluator(HyperlinkEvaluator));
01138 
01139             // Email addresses: <address@domain.foo>
01140             string pattern =
01141                 @"<
01142                 (?:mailto:)?
01143                 (
01144                     [-.\w]+
01145                     \@
01146                     [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
01147                 )
01148                 >";
01149 
01150             text = Regex.Replace(text, pattern, new MatchEvaluator(EmailEvaluator), RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);
01151 
01152             return text;
01153         }
01154 
01155         private string HyperlinkEvaluator(Match match)
01156         {
01157             string link = match.Groups[1].Value;
01158             return string.Format("<a href=\"{0}\">{0}</a>", link);
01159         }
01160 
01161         private string EmailEvaluator(Match match)
01162         {
01163             string email = UnescapeSpecialChars(match.Groups[1].Value);
01164 
01165             /*
01166                 Input: an email address, e.g. "foo@example.com"
01167             
01168                 Output: the email address as a mailto link, with each character
01169                         of the address encoded as either a decimal or hex entity, in
01170                         the hopes of foiling most address harvesting spam bots. E.g.:
01171             
01172                   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
01173                     x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
01174                     &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
01175             
01176                 Based by a filter by Matthew Wickline, posted to the BBEdit-Talk
01177                 mailing list: <http://tinyurl.com/yu7ue>
01178             
01179              */
01180             email = "mailto:" + email;
01181 
01182             // leave ':' alone (to spot mailto: later) 
01183             email = Regex.Replace(email, @"([^\:])", new MatchEvaluator(EncodeEmailEvaluator));
01184 
01185             email = string.Format("<a href=\"{0}\">{0}</a>", email);
01186 
01187             // strip the mailto: from the visible part
01188             email = Regex.Replace(email, "\">.+?:", "\">");
01189             return email;
01190         }
01191 
01192         private string EncodeEmailEvaluator(Match match)
01193         {
01194             char c = Convert.ToChar(match.Groups[1].Value);
01195 
01196             Random rnd = new Random();
01197             int r = rnd.Next(0, 100);
01198 
01199             // Original author note:
01200             // Roughly 10% raw, 45% hex, 45% dec 
01201             // '@' *must* be encoded. I insist.
01202             if (r > 90 && c != '@') return c.ToString();
01203             if (r < 45) return string.Format("&#x{0:x};", (int)c);
01204 
01205             return string.Format("&#x{0:x};", (int)c);
01206         }
01207 
01208         #endregion
01209 
01210         #region EncodeAmpsAndAngles, EncodeBackslashEscapes, UnescapeSpecialChars, Outdent, UnslashQuotes
01211 
01215         private string EncodeAmpsAndAngles(string text)
01216         {
01217             // Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
01218             // http://bumppo.net/projects/amputator/
01219 
01220             text = Regex.Replace(text, @"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)", "&amp;");
01221 
01222             // Encode naked <'s
01223             text = Regex.Replace(text, @"<(?![a-z/?\$!])", "&lt;", RegexOptions.IgnoreCase);
01224 
01225             return text;
01226         }
01227 
01228         private string EncodeBackslashEscapes(string value)
01229         {
01230             // Must process escaped backslashes first.
01231             foreach (string key in backslashEscapeTable.Keys)
01232                 value = value.Replace(key, backslashEscapeTable[key].ToString());
01233 
01234             return value;
01235         }
01236 
01240         private string UnescapeSpecialChars(string text)
01241         {
01242             foreach (string key in escapeTable.Keys)
01243                 text = text.Replace(escapeTable[key].ToString(), key);
01244 
01245             return text;
01246         }
01247 
01251         private string Outdent(string block)
01252         {
01253             return Regex.Replace(block, @"^(\t|[ ]{1," + tabWidth.ToString() + @"})", string.Empty, RegexOptions.Multiline);
01254         }
01255         #endregion
01256 
01257         #region Replace tabs with spaces and pad them to tab width
01258 
01259         private string Detab(string text)
01260         {
01261             // Inspired from a post by Bart Lateur: 
01262             // http://www.nntp.perl.org/group/perl.macperl.anyperl/154
01263             return Regex.Replace(text, @"^(.*?)\t", new MatchEvaluator(TabEvaluator), RegexOptions.Multiline);
01264         }
01265 
01266         private string TabEvaluator(Match match)
01267         {
01268             string leading = match.Groups[1].Value;
01269             return string.Concat(leading, RepeatString(" ", tabWidth - leading.Length % tabWidth));
01270         }
01271 
01272         #endregion
01273 
01274         #region Helper methods (RepeatString & ComputeMD5)
01275 
01282         private static string RepeatString(string text, int count)
01283         {
01284             string res = null;
01285 
01286             for (int i = 0; i < count; i++)
01287                 res += text;
01288 
01289             return res;
01290         }
01291 
01297         private static string ComputeMD5(string text)
01298         {
01299             MD5 algo = MD5.Create();
01300             byte[] plainText = Encoding.UTF8.GetBytes(text);
01301             byte[] hashedText = algo.ComputeHash(plainText);
01302             string res = null;
01303 
01304             foreach (byte b in hashedText)
01305                 res += b.ToString("x2");
01306 
01307             return res;
01308         }
01309         #endregion
01310     }
01311 }