MarkdownSharp
1.12
|
/*
 * Markdown  -  A text-to-HTML conversion tool for web writers
 * Copyright (c) 2004 John Gruber
 * http://daringfireball.net/projects/markdown/
 *
 * Copyright (c) 2004 Michel Fortin - Translation to PHP
 * http://www.michelf.com/projects/php-markdown/
 *
 * Copyright (c) 2004-2005 Milan Negovan - C# translation to .NET
 * http://www.aspnetresources.com
 *
 */

#region Copyright and license

/*
Copyright (c) 2003-2004 John Gruber
<http://daringfireball.net/>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name "Markdown" nor the names of its contributors may
  be used to endorse or promote products derived from this software
  without specific prior written permission.

This software is provided by the copyright holders and contributors "as
is" and any express or implied warranties, including, but not limited
to, the implied warranties of merchantability and fitness for a
particular purpose are disclaimed. In no event shall the copyright owner
or contributors be liable for any direct, indirect, incidental, special,
exemplary, or consequential damages (including, but not limited to,
procurement of substitute goods or services; loss of use, data, or
profits; or business interruption) however caused and on any theory of
liability, whether in contract, strict liability, or tort (including
negligence or otherwise) arising in any way out of the use of this
software, even if advised of the possibility of such damage.
*/

#endregion

using System;
using System.Collections;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;

namespace MarkdownSharp
{
    /// <summary>
    /// Legacy Markdown-to-HTML converter. Call <see cref="Transform"/> with Markdown text
    /// to obtain the rendered HTML.
    /// </summary>
    /// <remarks>
    /// An instance keeps per-document state (link definitions, hashed HTML blocks and the
    /// current list nesting level) in fields, so a single instance must not be used from
    /// several threads at once.
    /// </remarks>
    [Obsolete("This old version is included only for historical comparison purposes; use at your own risk!")]
    public class MarkdownOld
    {
        /// <summary>
        /// Simple two-slot container; <see cref="TokenizeHTML"/> uses it to carry a token
        /// kind ("text" or "tag") in <see cref="First"/> and the token text in <see cref="Second"/>.
        /// </summary>
        public class Pair
        {
            public Object First;
            public Object Second;
        }

        #region Class members

        private const int nestedBracketDepth = 6;
        private const string emptyElementSuffix = " />"; // Change to ">" for HTML output
        private const int tabWidth = 4;

        // Regex fragments for list item markers.
        private static readonly string markerUL;
        private static readonly string markerOL;
        private static readonly string markerAny;

        // Regex fragment matching bracketed text nested up to nestedBracketDepth levels.
        private static readonly string nestedBrackets;

        // Maps each special character to the MD5-based token that stands in for it.
        private static readonly Hashtable escapeTable;

        // Same tokens, keyed by the backslash-escaped form of each character.
        private static readonly Hashtable backslashEscapeTable;

        // Link definitions collected by StripLinkDefinitions, keyed by lower-cased link id.
        private Hashtable urls;
        private Hashtable titles;

        // Raw HTML blocks removed from the text, keyed by the hash that replaced them.
        private Hashtable htmlBlocks;

        // Nesting depth of the list currently being processed (0 = not inside a list).
        private int listLevel = 0;

        // One generator per instance: creating a new Random for every character can hand
        // back the same value repeatedly when the seeds are time-based.
        private readonly Random emailRandom = new Random();

        #endregion

        /// <summary>
        /// Builds the shared regex fragments and the escape tables once per process.
        /// </summary>
        static MarkdownOld()
        {
            nestedBrackets = string.Concat(
                RepeatString(@"(?>[^\[\]]+|\[", nestedBracketDepth),
                RepeatString(@"\])*", nestedBracketDepth));

            markerUL = @"[*+-]";
            markerOL = @"\d+[.]";
            markerAny = string.Format("(?:{0}|{1})", markerUL, markerOL);

            // Table of hash values for escaped characters:
            string[] specialChars = new string[]
            {
                @"\", "`", "*", "_", "{", "}", "[", "]",
                "(", ")", ">", "#", "+", "-", ".", "!"
            };

            escapeTable = new Hashtable();

            foreach (string special in specialChars)
            {
                escapeTable[special] = ComputeMD5(special);
            }

            // Create an identical table but for escaped characters.
            backslashEscapeTable = new Hashtable();

            foreach (string key in escapeTable.Keys)
            {
                backslashEscapeTable[@"\" + key] = escapeTable[key];
            }
        }

        /// <summary>
        /// Creates a converter with empty link-definition and HTML-block tables.
        /// </summary>
        public MarkdownOld()
        {
            urls = new Hashtable();
            titles = new Hashtable();
            htmlBlocks = new Hashtable();
        }

        /// <summary>
        /// Converts Markdown text to HTML.
        /// </summary>
        /// <param name="text">Markdown source; must not be null.</param>
        /// <returns>The generated HTML followed by a single newline.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public string Transform(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Standardize line endings:
            // DOS to Unix and Mac to Unix
            text = text.Replace("\r\n", "\n").Replace("\r", "\n");

            // Make sure text ends with a couple of newlines:
            text += "\n\n";

            // Convert all tabs to spaces.
            text = Detab(text);

            // Strip any lines consisting only of spaces and tabs.
            // This makes subsequent regexen easier to write, because we can
            // match consecutive blank lines with /\n+/ instead of something
            // contorted like /[ \t]*\n+/ .
            text = Regex.Replace(text, @"^[ \t]+$", string.Empty, RegexOptions.Multiline);

            // Turn block-level HTML blocks into hash entries
            text = HashHTMLBlocks(text);

            // Strip link definitions, store in hashes.
            text = StripLinkDefinitions(text);

            text = RunBlockGamut(text);

            text = UnescapeSpecialChars(text);

            return text + "\n";
        }

        #region Process link definitions

        /// <summary>
        /// Removes link definitions of the form <c>[id]: url "optional title"</c> from the
        /// text and records them in the url and title tables.
        /// </summary>
        private string StripLinkDefinitions(string text)
        {
            string pattern = string.Format(@"
                ^[ ]{{0,{0}}}\[(.+)\]:  # id = $1
                  [ \t]*
                  \n?                   # maybe *one* newline
                  [ \t]*
                <?(\S+?)>?              # url = $2
                  [ \t]*
                  \n?                   # maybe one newline
                  [ \t]*
                (?:
                    (?<=\s)             # lookbehind for whitespace
                    [\x22(]
                    (.+?)               # title = $3
                    [\x22)]
                    [ \t]*
                )?                      # title is optional
                (?:\n+|\Z)", tabWidth - 1);

            return Regex.Replace(text, pattern, new MatchEvaluator(LinkEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
        }

        /// <summary>
        /// Stores one matched link definition and removes it from the text.
        /// </summary>
        private string LinkEvaluator(Match match)
        {
            // Ids are case-insensitive; fold with the invariant culture so lookups do not
            // depend on the current thread culture.
            string linkID = match.Groups[1].Value.ToLowerInvariant();
            urls[linkID] = EncodeAmpsAndAngles(match.Groups[2].Value);

            if (match.Groups[3].Length > 0)
            {
                titles[linkID] = match.Groups[3].Value.Replace("\"", "&quot;");
            }

            return string.Empty;
        }

        #endregion

        #region Hashify HTML blocks

        /// <summary>
        /// Replaces block-level HTML (and standalone hr tags and comments) with hash keys so
        /// that later passes leave the raw HTML alone. The originals are kept in htmlBlocks.
        /// </summary>
        private string HashHTMLBlocks(string text)
        {
            /*
             We only want to do this for block-level HTML tags, such as headers,
             lists, and tables. That's because we still want to wrap <p>s around
             "paragraphs" that are wrapped in non-block-level tags, such as anchors,
             phrase emphasis, and spans. The list of tags we're looking for is
             hard-coded:
            */
            string blockTags1 = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del";
            string blockTags2 = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math";

            /*
             First, look for nested blocks, e.g.:
                <div>
                    <div>
                    tags for inner block must be indented.
                    </div>
                </div>

             The outermost tags must start at the left margin for this to match, and
             the inner nested divs must be indented.
             We need to do this before the next, more liberal match, because the next
             match will start at the first `<div>` and stop at the first `</div>`.
            */
            string pattern = string.Format(@"
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <({0})              # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    </\2>               # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)          # followed by a newline or end of document
                )", blockTags1);

            text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);

            // Now match more liberally, simply from `\n<tag>` to `</tag>\n`
            pattern = string.Format(@"
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <({0})              # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    .*</\2>             # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)          # followed by a newline or end of document
                )", blockTags2);

            text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);

            // Special case just for <hr />. It was easier to make a special case than
            // to make the other regex more complicated.
            // The quantifier is written without inner whitespace ({0,n}), exactly like the
            // comment pattern below, so that the regex engine parses it as a quantifier.
            pattern = string.Format(@"
                (?:
                    (?<=\n\n)           # Starting after a blank line
                    |                   # or
                    \A\n?               # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{{0,{0}}}
                    <(hr)               # start tag = $2
                    \b                  # word break
                    ([^<>])*?           #
                    /?>                 # the matching end tag
                    [ \t]*
                    (?=\n{{2,}}|\Z)     # followed by a blank line or end of document
                )", tabWidth - 1);
            text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.IgnorePatternWhitespace);

            // Special case for standalone HTML comments:
            pattern = string.Format(@"
                (?:
                    (?<=\n\n)           # Starting after a blank line
                    |                   # or
                    \A\n?               # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{{0,{0}}}
                    (?s:
                        <!
                        (--.*?--\s*)+
                        >
                    )
                    [ \t]*
                    (?=\n{{2,}}|\Z)     # followed by a blank line or end of document
                )", tabWidth - 1);
            text = Regex.Replace(text, pattern, new MatchEvaluator(HtmlEvaluator), RegexOptions.IgnorePatternWhitespace);

            return text;
        }

        /// <summary>
        /// Stores the matched HTML block under its hash and returns the hash, surrounded by
        /// blank lines, as the replacement text.
        /// </summary>
        private string HtmlEvaluator(Match match)
        {
            string html = match.Groups[1].Value;
            string key = ComputeMD5(html);
            htmlBlocks[key] = html;

            // String that will replace the block
            return string.Concat("\n\n", key, "\n\n");
        }

        #endregion

        #region Run transformations that form block-level elements (RunBlockGamut)

        /// <summary>
        /// Applies the block-level transformations in order: headers, horizontal rules,
        /// lists, code blocks, block quotes and finally paragraphs.
        /// </summary>
        private string RunBlockGamut(string text)
        {
            text = DoHeaders(text);

            // Do Horizontal Rules:
            string rule = "<hr" + emptyElementSuffix + "\n";
            RegexOptions ruleOptions = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;

            text = Regex.Replace(text, @"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", rule, ruleOptions);
            text = Regex.Replace(text, @"^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$", rule, ruleOptions);
            text = Regex.Replace(text, @"^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$", rule, ruleOptions);

            text = DoLists(text);
            text = DoCodeBlocks(text);
            text = DoBlockQuotes(text);

            /*
             We already ran HashHTMLBlocks() before, in Transform(), but that
             was to escape raw HTML in the original Markdown source. This time,
             we're escaping the markup we've just created, so that we don't wrap
             <p> tags around block-level tags.
            */
            text = HashHTMLBlocks(text);

            text = FormParagraphs(text);

            return text;
        }

        #endregion

        #region Run transformations within block-level elements (RunSpanGamut)

        /// <summary>
        /// Applies the span-level transformations in order: code spans, escapes, images,
        /// anchors, auto-links, entity encoding, emphasis and hard line breaks.
        /// </summary>
        private string RunSpanGamut(string text)
        {
            text = DoCodeSpans(text);

            text = EscapeSpecialChars(text);

            // Process anchor and image tags. Images must come first,
            // because ![foo][f] looks like an anchor.
            text = DoImages(text);
            text = DoAnchors(text);

            // Make links out of things like `<http://example.com/>`
            // Must come after DoAnchors(), because you can use < and >
            // delimiters in inline links like [this](<url>).
            text = DoAutoLinks(text);

            // Fix unencoded ampersands and <'s:
            text = EncodeAmpsAndAngles(text);

            text = DoItalicsAndBold(text);

            // Do hard breaks:
            text = Regex.Replace(text, @" {2,}\n", string.Format("<br{0}\n", emptyElementSuffix));

            return text;
        }

        #endregion

        #region Parse HTML into tokens

        /// <summary>
        /// Splits text into a list of <see cref="Pair"/> tokens. Each token has First set to
        /// "tag" (an HTML tag, comment or processing instruction) or "text" (everything in
        /// between) and Second set to the token's text.
        /// </summary>
        private ArrayList TokenizeHTML(string text)
        {
            // Regular expression derived from the _tokenize() subroutine in
            // Brad Choate's MTRegex plugin.
            // http://www.bradchoate.com/past/mtregex.php
            int pos = 0;
            int depth = 6;
            ArrayList tokens = new ArrayList();

            string nestedTags = string.Concat(RepeatString(@"(?:<[a-z\/!$](?:[^<>]|", depth),
                                              RepeatString(@")*>)", depth));
            string pattern = string.Concat(@"(?s:<!(?:--.*?--\s*)+>)|(?s:<\?.*?\?>)|", nestedTags);

            MatchCollection matches = Regex.Matches(text, pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);

            foreach (Match m in matches)
            {
                // Text between the previous tag and this one, if any.
                if (pos < m.Index)
                {
                    Pair textToken = new Pair();
                    textToken.First = "text";
                    textToken.Second = text.Substring(pos, m.Index - pos);
                    tokens.Add(textToken);
                }

                Pair tagToken = new Pair();
                tagToken.First = "tag";
                tagToken.Second = m.Value;
                tokens.Add(tagToken);

                pos = m.Index + m.Length;
            }

            // Trailing text after the last tag.
            if (pos < text.Length)
            {
                Pair tail = new Pair();
                tail.First = "text";
                tail.Second = text.Substring(pos, text.Length - pos);
                tokens.Add(tail);
            }

            return tokens;
        }

        #endregion

        #region Escape special characters

        /// <summary>
        /// Replaces "*" and "_" with their hash tokens so that they are not later
        /// interpreted as emphasis markers.
        /// </summary>
        private static string HashEmphasisChars(string value)
        {
            return value.Replace("*", escapeTable["*"].ToString())
                        .Replace("_", escapeTable["_"].ToString());
        }

        /// <summary>
        /// Hashes emphasis characters inside HTML tags and processes backslash escapes in
        /// the text between tags.
        /// </summary>
        private string EscapeSpecialChars(string text)
        {
            ArrayList tokens = TokenizeHTML(text);

            // Rebuild text from the tokens
            StringBuilder rebuilt = new StringBuilder(text.Length);

            foreach (Pair token in tokens)
            {
                string value = token.Second.ToString();

                if (token.First.Equals("tag"))
                {
                    /*
                     Within tags, encode * and _ so they don't conflict with their use
                     in Markdown for italics and strong. We're replacing each
                     such character with its corresponding MD5 checksum value;
                     this is likely overkill, but it should prevent us from colliding
                     with the escape values by accident.
                    */
                    value = HashEmphasisChars(value);
                }
                else
                {
                    value = EncodeBackslashEscapes(value);
                }

                rebuilt.Append(value);
            }

            return rebuilt.ToString();
        }

        #endregion

        #region Process referenced and inline anchors

        /// <summary>
        /// Turns reference-style and inline-style Markdown links into anchor tags.
        /// </summary>
        private string DoAnchors(string text)
        {
            //
            // First, handle reference-style links: [link text] [id]
            //
            string pattern = string.Format(@"
                (               # wrap whole match in $1
                    \[
                        ({0})   # link text = $2
                    \]

                    [ ]?        # one optional space
                    (?:\n[ ]*)? # one optional newline followed by spaces

                    \[
                        (.*?)   # id = $3
                    \]
                )", nestedBrackets);

            text = Regex.Replace(text, pattern, new MatchEvaluator(AnchorReferenceEvaluator), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            //
            // Next, inline-style links: [link text](url "optional title")
            //
            pattern = string.Format(@"
                (               # wrap whole match in $1
                    \[
                        ({0})   # link text = $2
                    \]
                    \(          # literal paren
                        [ \t]*
                        <?(.*?)>?   # href = $3
                        [ \t]*
                        (           # $4
                        (['\x22])   # quote char = $5
                        (.*?)       # Title = $6
                        \5          # matching quote
                        )?          # title is optional
                    \)
                )", nestedBrackets);

            text = Regex.Replace(text, pattern, new MatchEvaluator(AnchorInlineEvaluator), RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

            return text;
        }

        /// <summary>
        /// Renders a reference-style link, or returns the match untouched when the link id
        /// has no stored definition.
        /// </summary>
        private string AnchorReferenceEvaluator(Match match)
        {
            string wholeMatch = match.Groups[1].Value;
            string linkText = match.Groups[2].Value;
            string linkID = match.Groups[3].Value.ToLowerInvariant();

            // for shortcut links like [this][].
            if (linkID.Length == 0)
            {
                linkID = linkText.ToLowerInvariant();
            }

            if (urls[linkID] == null)
            {
                return wholeMatch;
            }

            // We've got to encode these to avoid conflicting with italics/bold.
            string url = HashEmphasisChars(urls[linkID].ToString());
            string res = string.Format("<a href=\"{0}\"", url);

            if (titles[linkID] != null)
            {
                string title = HashEmphasisChars(titles[linkID].ToString());
                res += string.Format(" title=\"{0}\"", title);
            }

            res += string.Format(">{0}</a>", linkText);
            return res;
        }

        /// <summary>
        /// Renders an inline-style link.
        /// </summary>
        private string AnchorInlineEvaluator(Match match)
        {
            string linkText = match.Groups[2].Value;
            string title = match.Groups[6].Value;

            // We've got to encode these to avoid conflicting with italics/bold.
            string url = HashEmphasisChars(match.Groups[3].Value);
            string res = string.Format("<a href=\"{0}\"", url);

            if (title.Length > 0)
            {
                title = HashEmphasisChars(title.Replace("\"", "&quot;"));
                res += string.Format(" title=\"{0}\"", title);
            }

            res += string.Format(">{0}</a>", linkText);
            return res;
        }

        #endregion

        #region Process inline and referenced images

        /// <summary>
        /// Turns reference-style and inline-style Markdown images into img tags.
        /// </summary>
        private string DoImages(string text)
        {
            // First, handle reference-style labeled images: ![alt text][id]
            string pattern = @"
                (               # wrap whole match in $1
                    !\[
                        (.*?)   # alt text = $2
                    \]

                    [ ]?        # one optional space
                    (?:\n[ ]*)? # one optional newline followed by spaces

                    \[
                        (.*?)   # id = $3
                    \]

                )";

            text = Regex.Replace(text, pattern, new MatchEvaluator(ImageReferenceEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

            // Next, handle inline images: ![alt text](url "optional title")
            // Don't forget: encode * and _
            pattern = @"
                (               # wrap whole match in $1
                    !\[
                        (.*?)   # alt text = $2
                    \]
                    \(          # literal paren
                        [ \t]*
                        <?(\S+?)>?  # src url = $3
                        [ \t]*
                        (           # $4
                        (['\x22])   # quote char = $5
                        (.*?)       # title = $6
                        \5          # matching quote
                        [ \t]*
                        )?          # title is optional
                    \)
                )";

            text = Regex.Replace(text, pattern, new MatchEvaluator(ImageInlineEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

            return text;
        }

        /// <summary>
        /// Renders a reference-style image, or returns the match untouched when the link id
        /// has no stored definition.
        /// </summary>
        private string ImageReferenceEvaluator(Match match)
        {
            string wholeMatch = match.Groups[1].Value;
            string altText = match.Groups[2].Value;
            string linkID = match.Groups[3].Value.ToLowerInvariant();

            // for shortcut links like ![this][].
            if (linkID.Length == 0)
            {
                linkID = altText.ToLowerInvariant();
            }

            if (urls[linkID] == null)
            {
                // If there's no such link ID, leave intact:
                return wholeMatch;
            }

            altText = altText.Replace("\"", "&quot;");

            // We've got to encode these to avoid conflicting with italics/bold.
            string url = HashEmphasisChars(urls[linkID].ToString());
            string res = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, altText);

            if (titles[linkID] != null)
            {
                string title = HashEmphasisChars(titles[linkID].ToString());
                res += string.Format(" title=\"{0}\"", title);
            }

            res += emptyElementSuffix;
            return res;
        }

        /// <summary>
        /// Renders an inline-style image. A title attribute is written only when a title
        /// was supplied, matching what <see cref="AnchorInlineEvaluator"/> does for links.
        /// </summary>
        private string ImageInlineEvaluator(Match match)
        {
            string altText = match.Groups[2].Value.Replace("\"", "&quot;");
            string title = match.Groups[6].Value;

            // We've got to encode these to avoid conflicting with italics/bold.
            string url = HashEmphasisChars(match.Groups[3].Value);
            string res = string.Format("<img src=\"{0}\" alt=\"{1}\"", url, altText);

            if (title.Length > 0)
            {
                title = HashEmphasisChars(title.Replace("\"", "&quot;"));
                res += string.Format(" title=\"{0}\"", title);
            }

            res += emptyElementSuffix;
            return res;
        }

        #endregion

        #region Process headers

        /// <summary>
        /// Converts Setext-style (underlined) and atx-style (hash-prefixed) headers.
        /// </summary>
        private string DoHeaders(string text)
        {
            /*
             Setext-style headers:

                Header 1
                ========

                Header 2
                --------
            */

            text = Regex.Replace(text, @"^(.+)[ \t]*\n=+[ \t]*\n+", new MatchEvaluator(SetextHeader1Evaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
            text = Regex.Replace(text, @"^(.+)[ \t]*\n-+[ \t]*\n+", new MatchEvaluator(SetextHeader2Evaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);

            /*
             atx-style headers:
                # Header 1
                ## Header 2
                ## Header 2 with closing hashes ##
                ...
                ###### Header 6
            */
            string pattern = @"
                ^(\#{1,6})  # $1 = string of #'s
                [ \t]*
                (.+?)       # $2 = Header text
                [ \t]*
                \#*         # optional closing #'s (not counted)
                \n+";

            text = Regex.Replace(text, pattern, new MatchEvaluator(AtxHeaderEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);

            return text;
        }

        /// <summary>Renders a Setext header underlined with "=" as a level-1 header.</summary>
        private string SetextHeader1Evaluator(Match match)
        {
            return string.Concat("<h1>", RunSpanGamut(match.Groups[1].Value), "</h1>\n\n");
        }

        /// <summary>Renders a Setext header underlined with "-" as a level-2 header.</summary>
        private string SetextHeader2Evaluator(Match match)
        {
            return string.Concat("<h2>", RunSpanGamut(match.Groups[1].Value), "</h2>\n\n");
        }

        /// <summary>Renders an atx header; the level is the number of leading "#" characters.</summary>
        private string AtxHeaderEvaluator(Match match)
        {
            int level = match.Groups[1].Value.Length;
            string headerText = match.Groups[2].Value;

            return string.Concat("<h", level, ">", RunSpanGamut(headerText), "</h", level, ">\n\n");
        }

        #endregion

        #region Process ordered and unordered lists

        /// <summary>
        /// Converts Markdown lists. At the top level a list must follow a blank line or
        /// start the document; inside a list it may start at any line start.
        /// </summary>
        private string DoLists(string text)
        {
            // Re-usable pattern to match any entire ul or ol list:
            string wholeList = string.Format(@"
                (                               # $1 = whole list
                  (                             # $2
                    [ ]{{0,{1}}}
                    ({0})                       # $3 = first list item marker
                    [ \t]+
                  )
                  (?s:.+?)
                  (                             # $4
                      \z
                    |
                      \n{{2,}}
                      (?=\S)
                      (?!                       # Negative lookahead for another list item marker
                        [ \t]*
                        {0}[ \t]+
                      )
                  )
                )", markerAny, tabWidth - 1);

            // We use a different prefix before nested lists than top-level lists.
            // See extended comment in ProcessListItems().
            string prefix = listLevel > 0 ? "^" : @"(?:(?<=\n\n)|\A\n?)";

            return Regex.Replace(text, prefix + wholeList, new MatchEvaluator(ListEvaluator), RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
        }

        /// <summary>
        /// Renders one whole list as a ul or ol element, chosen from its first item marker.
        /// </summary>
        private string ListEvaluator(Match match)
        {
            string list = match.Groups[1].Value;
            string listType = Regex.IsMatch(match.Groups[3].Value, markerUL) ? "ul" : "ol";

            // Turn double returns into triple returns, so that we can make a
            // paragraph for the last item in a list, if necessary:
            list = Regex.Replace(list, @"\n{2,}", "\n\n\n");

            string items = ProcessListItems(list, markerAny);
            return string.Format("<{0}>\n{1}</{0}>\n", listType, items);
        }

        /// <summary>
        /// Converts the raw text of a list into a sequence of li elements.
        /// </summary>
        /// <param name="list">Text of the whole list.</param>
        /// <param name="marker">Regex fragment matching a list item marker.</param>
        private string ProcessListItems(string list, string marker)
        {
            /*
             The listLevel field keeps track of when we're inside a list.
             Each time we enter a list, we increment it; when we leave a list,
             we decrement. If it's zero, we're not in a list anymore.

             We do this because when we're not inside a list, we want to treat
             something like this:

                I recommend upgrading to version
                8. Oops, now this line is treated
                as a sub-list.

             As a single paragraph, despite the fact that the second line starts
             with a digit-period-space sequence.

             Whereas when we're inside a list (or sub-list), that line will be
             treated as the start of a sub-list. What a kludge, huh? This is
             an aspect of Markdown's syntax that's hard to parse perfectly
             without resorting to mind-reading. Perhaps the solution is to
             change the syntax rules such that sub-lists must start with a
             starting cardinal number; e.g. "1." or "a.".
            */

            listLevel++;

            // Trim trailing blank lines:
            list = Regex.Replace(list, @"\n{2,}\z", "\n");

            string pattern = string.Format(
                @"(\n)?                      # leading line = $1
                (^[ \t]*)                    # leading whitespace = $2
                ({0}) [ \t]+                 # list marker = $3
                ((?s:.+?)                    # list item text = $4
                (\n{{1,2}}))
                (?= \n* (\z | \2 ({0}) [ \t]+))", marker);

            list = Regex.Replace(list, pattern, new MatchEvaluator(ListEvaluator2),
                                 RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
            listLevel--;
            return list;
        }

        /// <summary>
        /// Renders a single list item. Items preceded by a blank line or containing a blank
        /// line get full block processing; others get sub-list and span processing only.
        /// </summary>
        private string ListEvaluator2(Match match)
        {
            string item = match.Groups[4].Value;
            string leadingLine = match.Groups[1].Value;

            if (leadingLine.Length > 0 || Regex.IsMatch(item, @"\n{2,}"))
            {
                item = RunBlockGamut(Outdent(item));
            }
            else
            {
                // Recursion for sub-lists:
                item = DoLists(Outdent(item));
                item = item.TrimEnd('\n');
                item = RunSpanGamut(item);
            }

            return string.Format("<li>{0}</li>\n", item);
        }

        #endregion

        #region Process code blocks

        /// <summary>
        /// Converts indented blocks into pre/code blocks.
        /// </summary>
        private string DoCodeBlocks(string text)
        {
            // TODO: Should we allow 2 empty lines here or only one?
            string pattern = string.Format(@"
                (?:\n\n|\A)
                (                        # $1 = the code block -- one or more lines, starting with a space/tab
                  (?:
                    (?:[ ]{{{0}}} | \t)  # Lines must start with a tab or a tab-width of spaces
                    .*\n+
                  )+
                )
                ((?=^[ ]{{0,{0}}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc",
                tabWidth);

            return Regex.Replace(text, pattern,
                                 new MatchEvaluator(CodeBlockEvaluator),
                                 RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);
        }

        /// <summary>
        /// Renders one matched code block: outdents it, encodes it and wraps it in pre/code.
        /// </summary>
        private string CodeBlockEvaluator(Match match)
        {
            string codeBlock = EncodeCode(Outdent(match.Groups[1].Value));

            // Trim leading newlines and trailing whitespace
            codeBlock = Regex.Replace(codeBlock, @"^\n+", string.Empty);
            codeBlock = Regex.Replace(codeBlock, @"\s+\z", string.Empty);

            return string.Concat("\n\n<pre><code>", codeBlock, "\n</code></pre>\n\n");
        }

        #endregion

        #region Process code spans

        /// <summary>
        /// Converts backtick-delimited spans into code elements.
        /// </summary>
        private string DoCodeSpans(string text)
        {
            /*
             *  Backtick quotes are used for <code></code> spans.
             *  You can use multiple backticks as the delimiters if you want to
                include literal backticks in the code span. So, this input:

                    Just type ``foo `bar` baz`` at the prompt.

                Will translate to:

                    <p>Just type <code>foo `bar` baz</code> at the prompt.</p>

                There's no arbitrary limit to the number of backticks you
                can use as delimiters. If you need three consecutive backticks
                in your code, use four for delimiters, etc.

             *  You can use spaces to get literal backticks at the edges:

                    ... type `` `bar` `` ...

                Turns to:

                    ... type <code>`bar`</code> ...
            */

            string pattern = @"
                (`+)        # $1 = Opening run of `
                (.+?)       # $2 = The code block
                (?<!`)
                \1
                (?!`)";

            return Regex.Replace(text, pattern,
                                 new MatchEvaluator(CodeSpanEvaluator),
                                 RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
        }

        /// <summary>
        /// Renders one code span: trims leading and trailing spaces/tabs, encodes the
        /// content and wraps it in a code element.
        /// </summary>
        private string CodeSpanEvaluator(Match match)
        {
            string code = match.Groups[2].Value;

            // These are regex patterns, so they must go through Regex.Replace;
            // String.Replace would look for the pattern text literally and trim nothing.
            code = Regex.Replace(code, @"^[ \t]*", string.Empty);
            code = Regex.Replace(code, @"[ \t]*$", string.Empty);
            code = EncodeCode(code);

            return string.Concat("<code>", code, "</code>");
        }

        #endregion

        #region Encode/escape certain characters inside Markdown code runs

        /// <summary>
        /// Makes code literal: encodes ampersands and angle brackets as HTML entities and
        /// replaces every character in the escape table with its hash token.
        /// </summary>
        private string EncodeCode(string code)
        {
            // Ampersands first, so the entities produced for < and > are not re-encoded.
            code = code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;");

            foreach (string key in escapeTable.Keys)
            {
                code = code.Replace(key, escapeTable[key].ToString());
            }

            return code;
        }

        #endregion

        #region Process bold and italics

        /// <summary>
        /// Converts double and single asterisk/underscore emphasis into strong and em elements.
        /// </summary>
        private string DoItalicsAndBold(string text)
        {
            // <strong> must go first:
            text = Regex.Replace(text, @"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1",
                                 new MatchEvaluator(BoldEvaluator),
                                 RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

            // Then <em>:
            text = Regex.Replace(text, @"(\*|_) (?=\S) (.+?) (?<=\S) \1",
                                 new MatchEvaluator(ItalicsEvaluator),
                                 RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);
            return text;
        }

        /// <summary>Wraps the emphasized text in an em element.</summary>
        private string ItalicsEvaluator(Match match)
        {
            return string.Format("<em>{0}</em>", match.Groups[2].Value);
        }

        /// <summary>Wraps the emphasized text in a strong element.</summary>
        private string BoldEvaluator(Match match)
        {
            return string.Format("<strong>{0}</strong>", match.Groups[2].Value);
        }

        #endregion

        #region Process blockquotes

        /// <summary>
        /// Converts runs of lines introduced by ">" into blockquote elements.
        /// </summary>
        private string DoBlockQuotes(string text)
        {
            string pattern =
                @"(                     # Wrap whole match in $1
                    (
                    ^[ \t]*>[ \t]?      # '>' at the start of a line
                    .+\n                # rest of the first line
                    (.+\n)*             # subsequent consecutive lines
                    \n*                 # blanks
                    )+
                )";

            return Regex.Replace(text, pattern, new MatchEvaluator(BlockQuoteEvaluator), RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
        }

        /// <summary>
        /// Renders one block quote: strips one level of quoting, runs the block gamut on
        /// the content and indents the result.
        /// </summary>
        private string BlockQuoteEvaluator(Match match)
        {
            string bq = match.Groups[1].Value;

            // Trim one level of quoting - trim whitespace-only lines
            bq = Regex.Replace(bq, @"^[ \t]*>[ \t]?", string.Empty, RegexOptions.Multiline);
            bq = Regex.Replace(bq, @"^[ \t]+$", string.Empty, RegexOptions.Multiline);

            bq = RunBlockGamut(bq);

            // NOTE(review): upstream Markdown indents by two spaces here; confirm that the
            // single-space literals (here and in BlockQuoteEvaluator2) are intended.
            bq = Regex.Replace(bq, @"^", " ", RegexOptions.Multiline);

            // These leading spaces screw with <pre> content, so we need to fix that:
            bq = Regex.Replace(bq, @"(\s*<pre>.+?</pre>)", new MatchEvaluator(BlockQuoteEvaluator2), RegexOptions.IgnorePatternWhitespace | RegexOptions.Singleline);

            return string.Format("<blockquote>\n{0}\n</blockquote>\n\n", bq);
        }

        /// <summary>
        /// Removes the indentation that <see cref="BlockQuoteEvaluator"/> added from a pre block.
        /// </summary>
        private string BlockQuoteEvaluator2(Match match)
        {
            return Regex.Replace(match.Groups[1].Value, @"^ ", string.Empty, RegexOptions.Multiline);
        }

        #endregion

        #region Create paragraph tags

        /// <summary>
        /// Splits the text on blank lines, wraps each non-HTML chunk in p tags and swaps
        /// hashed HTML blocks back in.
        /// </summary>
        private string FormParagraphs(string text)
        {
            // Strip leading and trailing lines:
            text = Regex.Replace(text, @"^\n+", string.Empty);
            text = Regex.Replace(text, @"\n+\z", string.Empty);

            string[] grafs = Regex.Split(text, @"\n{2,}");

            // Wrap <p> tags.
            for (int i = 0; i < grafs.Length; i++)
            {
                // Milan Negovan: I'm adding an additional check for an empty block of code.
                // Otherwise an empty <p></p> is created.
                if (htmlBlocks[grafs[i]] == null && grafs[i].Length > 0)
                {
                    string block = RunSpanGamut(grafs[i]);

                    block = Regex.Replace(block, @"^([ \t]*)", "<p>");
                    block += "</p>";

                    grafs[i] = block;
                }
            }

            // Unhashify HTML blocks
            for (int i = 0; i < grafs.Length; i++)
            {
                string block = (string)htmlBlocks[grafs[i]];

                if (block != null)
                {
                    grafs[i] = block;
                }
            }

            return string.Join("\n\n", grafs);
        }

        #endregion

        #region Process emails and links

        /// <summary>
        /// Turns angle-bracketed URLs and e-mail addresses into links.
        /// </summary>
        private string DoAutoLinks(string text)
        {
            text = Regex.Replace(text, "<((https?|ftp):[^'\">\\s]+)>", new MatchEvaluator(HyperlinkEvaluator));

            // Email addresses: <address@domain.foo>
            string pattern =
                @"<
                  (?:mailto:)?
                  (
                    [-.\w]+
                    \@
                    [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
                  )
                  >";

            text = Regex.Replace(text, pattern, new MatchEvaluator(EmailEvaluator), RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);

            return text;
        }

        /// <summary>Renders an automatic link whose text is the URL itself.</summary>
        private string HyperlinkEvaluator(Match match)
        {
            string link = match.Groups[1].Value;
            return string.Format("<a href=\"{0}\">{0}</a>", link);
        }

        /// <summary>
        /// Renders an e-mail address as an obfuscated mailto link.
        /// </summary>
        private string EmailEvaluator(Match match)
        {
            string email = UnescapeSpecialChars(match.Groups[1].Value);

            /*
             Input: an email address, e.g. "foo@example.com"

             Output: the email address as a mailto link, with each character
             of the address encoded as either a decimal or hex entity, in
             the hopes of foiling most address harvesting spam bots.

             Based by a filter by Matthew Wickline, posted to the BBEdit-Talk
             mailing list: <http://tinyurl.com/yu7ue>
            */
            email = "mailto:" + email;

            // leave ':' alone (to spot mailto: later)
            email = Regex.Replace(email, @"([^\:])", new MatchEvaluator(EncodeEmailEvaluator));

            email = string.Format("<a href=\"{0}\">{0}</a>", email);

            // strip the mailto: from the visible part
            email = Regex.Replace(email, "\">.+?:", "\">");
            return email;
        }

        /// <summary>
        /// Encodes one character of an e-mail address as itself, a hex entity or a decimal
        /// entity, chosen at random.
        /// </summary>
        private string EncodeEmailEvaluator(Match match)
        {
            char c = Convert.ToChar(match.Groups[1].Value);
            int r = emailRandom.Next(0, 100);

            // Original author note:
            // Roughly 10% raw, 45% hex, 45% dec
            // '@' *must* be encoded. I insist.
            if (r > 90 && c != '@')
            {
                return c.ToString();
            }

            if (r < 45)
            {
                return string.Format("&#x{0:x};", (int)c);
            }

            return string.Format("&#{0};", (int)c);
        }

        #endregion

        #region EncodeAmpsAndAngles, EncodeBackslashEscapes, UnescapeSpecialChars, Outdent, UnslashQuotes

        /// <summary>
        /// Encodes ampersands that do not start an entity, and "&lt;" characters that do
        /// not start a tag, as HTML entities.
        /// </summary>
        private string EncodeAmpsAndAngles(string text)
        {
            // Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
            // http://bumppo.net/projects/amputator/

            text = Regex.Replace(text, @"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)", "&amp;");

            // Encode naked <'s
            text = Regex.Replace(text, @"<(?![a-z/?\$!])", "&lt;", RegexOptions.IgnoreCase);

            return text;
        }

        private string EncodeBackslashEscapes(string value)
        {
            // Must process escaped backslashes first.
01231 foreach (string key in backslashEscapeTable.Keys) 01232 value = value.Replace(key, backslashEscapeTable[key].ToString()); 01233 01234 return value; 01235 } 01236 01240 private string UnescapeSpecialChars(string text) 01241 { 01242 foreach (string key in escapeTable.Keys) 01243 text = text.Replace(escapeTable[key].ToString(), key); 01244 01245 return text; 01246 } 01247 01251 private string Outdent(string block) 01252 { 01253 return Regex.Replace(block, @"^(\t|[ ]{1," + tabWidth.ToString() + @"})", string.Empty, RegexOptions.Multiline); 01254 } 01255 #endregion 01256 01257 #region Replace tabs with spaces and pad them to tab width 01258 01259 private string Detab(string text) 01260 { 01261 // Inspired from a post by Bart Lateur: 01262 // http://www.nntp.perl.org/group/perl.macperl.anyperl/154 01263 return Regex.Replace(text, @"^(.*?)\t", new MatchEvaluator(TabEvaluator), RegexOptions.Multiline); 01264 } 01265 01266 private string TabEvaluator(Match match) 01267 { 01268 string leading = match.Groups[1].Value; 01269 return string.Concat(leading, RepeatString(" ", tabWidth - leading.Length % tabWidth)); 01270 } 01271 01272 #endregion 01273 01274 #region Helper methods (RepeatString & ComputeMD5) 01275 01282 private static string RepeatString(string text, int count) 01283 { 01284 string res = null; 01285 01286 for (int i = 0; i < count; i++) 01287 res += text; 01288 01289 return res; 01290 } 01291 01297 private static string ComputeMD5(string text) 01298 { 01299 MD5 algo = MD5.Create(); 01300 byte[] plainText = Encoding.UTF8.GetBytes(text); 01301 byte[] hashedText = algo.ComputeHash(plainText); 01302 string res = null; 01303 01304 foreach (byte b in hashedText) 01305 res += b.ToString("x2"); 01306 01307 return res; 01308 } 01309 #endregion 01310 } 01311 }