3 # Markdown - A text-to-HTML conversion tool for web writers
6 # Copyright (c) 2004-2015 Michel Fortin
7 # <https://michelf.ca/projects/php-markdown/>
10 # Copyright (c) 2004-2006 John Gruber
11 # <https://daringfireball.net/projects/markdown/>
17 # Markdown Parser Class
20 class Markdown implements MarkdownInterface {
24 const MARKDOWNLIB_VERSION = "1.6.0";
26 ### Simple Function Interface ###
28 public static function defaultTransform($text) {
# Convenience entry point: converts $text with a parser instance of the
# class this static method was called on and returns the parser's output.
30 # Initialize the parser and return the result of its transform method.
31 # This will work fine for derived classes too.
33 # Take parser class on which this function was called.
34 $parser_class = \get_called_class();
36 # Try to take the parser from the static parser list (one slot per class
# name; the reference lets the assignment below fill that slot).
38 $parser =& $parser_list[$parser_class];
40 # Create the parser if not already set.
42 $parser = new $parser_class;
44 # Transform text using parser.
45 return $parser->transform($text);
48 ### Configuration Variables ###
50 # Change to ">" for HTML output.
# This suffix closes the <hr, <br and <img tags generated by the parser.
51 public $empty_element_suffix = " />";
# Tab width in columns. Block-level patterns allow at most tab_width - 1
# leading spaces before a marker, and code blocks require tab_width spaces.
52 public $tab_width = 4;
54 # Change to `true` to disallow markup or entities.
# With $no_markup set, hashHTMLBlocks() returns its input untouched.
55 public $no_markup = false;
56 public $no_entities = false;
58 # Predefined urls and titles for reference links and images.
# setup() copies them into the working url/title tables before each run.
59 public $predef_urls = array();
60 public $predef_titles = array();
62 # Optional filter function for URLs
# Called with the URL; its return value replaces the URL before encoding.
63 public $url_filter_func = null;
65 # Optional header id="" generation callback function.
# Called with the raw header text; a falsy result produces no id attribute.
66 public $header_id_func = null;
68 # Optional function for converting code block content to HTML
# Called with the outdented code block and an empty string as arguments.
69 public $code_block_content_func = null;
71 # Class attribute to toggle "enhanced ordered list" behaviour
72 # setting this to true will allow ordered lists to start from the index
73 # number that is defined first. For example:
79 # <li>List item two</li>
80 # <li>List item three</li>
82 public $enhanced_ordered_list = false;
84 ### Parser Implementation ###
86 # Regex to match balanced [brackets].
87 # Needed to insert a maximum bracket depth while converting to PHP.
# The *_re patterns are generated in the constructor from the *_depth values.
88 protected $nested_brackets_depth = 6;
89 protected $nested_brackets_re;
91 protected $nested_url_parenthesis_depth = 4;
92 protected $nested_url_parenthesis_re;
94 # Table of hash values for escaped characters:
# $escape_chars_re is the character class built from $escape_chars with
# preg_quote() in the constructor.
95 protected $escape_chars = '\`*_{}[]()>#+-.!';
96 protected $escape_chars_re;
99 public function __construct() {
101 # Constructor function. Initialize appropriate member variables.
# Prepares the emphasis token patterns, generates the bounded-depth regular
# expressions and sorts the three gamuts by priority.
104 $this->prepareItalicsAndBold();
# Each nesting pattern repeats an "opening" fragment N times followed by the
# matching "closing" fragment N times, N being the configured depth.
106 $this->nested_brackets_re =
107 str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth).
108 str_repeat('\])*', $this->nested_brackets_depth);
110 $this->nested_url_parenthesis_re =
111 str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth).
112 str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth);
# Character class matching any single escapable character.
114 $this->escape_chars_re = '['.preg_quote($this->escape_chars).']';
116 # Sort document, block, and span gamut in ascending priority order.
# asort() orders by the priority values and keeps the method-name keys.
117 asort($this->document_gamut);
118 asort($this->block_gamut);
119 asort($this->span_gamut);
123 # Internal hashes used during transformation.
# $urls and $titles are keyed by lower-cased link id; $html_hashes maps a
# hash token to the text it stands for.
124 protected $urls = array();
125 protected $titles = array();
126 protected $html_hashes = array();
128 # Status flag to avoid invalid nesting.
# Raised by doAnchors() while it runs so that links are not nested.
129 protected $in_anchor = false;
132 protected function setup() {
134 # Called before the transformation process starts to setup parser
# Seeds the url/title tables with the predefined entries, empties the HTML
# hash table and lowers the anchor nesting flag.
137 # Clear global hashes.
138 $this->urls = $this->predef_urls;
139 $this->titles = $this->predef_titles;
140 $this->html_hashes = array();
142 $this->in_anchor = false;
145 protected function teardown() {
147 # Called after the transformation process to clear any variable
148 # which may be taking up memory unnecessarily.
# Empties the url, title and HTML hash tables.
150 $this->urls = array();
151 $this->titles = array();
152 $this->html_hashes = array();
156 public function transform($text) {
# Converts the Markdown source in $text: normalizes the input, hashes raw
# HTML blocks, then applies every document gamut method in priority order.
158 # Main function. Performs some preprocessing on the input text
159 # and passes it through the document gamut.
163 # Remove UTF-8 BOM and marker character in input, if present.
# (\x1A is the character hashPart() uses inside its tokens.)
164 $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
166 # Standardize line endings:
167 # DOS to Unix and Mac to Unix
168 $text = preg_replace('{\r\n?}', "\n", $text);
170 # Make sure $text ends with a couple of newlines:
173 # Convert all tabs to spaces.
174 $text = $this->detab($text);
176 # Turn block-level HTML blocks into hash entries
177 $text = $this->hashHTMLBlocks($text);
179 # Strip any lines consisting only of spaces and tabs.
180 # This makes subsequent regexen easier to write, because we can
181 # match consecutive blank lines with /\n+/ instead of something
182 # contorted like /[ ]*\n+/ .
183 $text = preg_replace('/^[ ]+$/m', '', $text);
185 # Run document gamut methods.
# $priority is unused here; the list was already sorted in the constructor.
186 foreach ($this->document_gamut as $method => $priority) {
187 $text = $this->$method($text);
195 protected $document_gamut = array(
# Maps method name => priority. The constructor sorts the list with asort()
# and transform() calls each method in that order.
196 # Strip link definitions, store in hashes.
197 "stripLinkDefinitions" => 20,
199 "runBasicBlockGamut" => 30,
203 protected function stripLinkDefinitions($text) {
# Every definition matched by the pattern below is handed to
# _stripLinkDefinitions_callback(), whose return value replaces it.
205 # Strips link definitions from text, stores the URLs and titles in
# A definition may be indented by at most tab_width - 1 spaces.
208 $less_than_tab = $this->tab_width - 1;
210 # Link defs are in the form: ^[id]: url "optional title"
211 $text = preg_replace_callback('{
212 ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
214 \n? # maybe *one* newline
222 \n? # maybe one newline
225 (?<=\s) # lookbehind for whitespace
230 )? # title is optional
233 array($this, '_stripLinkDefinitions_callback'),
237 protected function _stripLinkDefinitions_callback($matches) {
# Records one link definition: the URL ($2, or $3 when $2 is empty) and the
# title ($4) are stored under the lower-cased link id ($1).
238 $link_id = strtolower($matches[1]);
239 $url = $matches[2] == '' ? $matches[3] : $matches[2];
240 $this->urls[$link_id] = $url;
# NOTE(review): the reference binding presumably avoids a notice when the
# optional title group did not participate in the match -- confirm.
241 $this->titles[$link_id] =& $matches[4];
242 return ''; # String that will replace the block
246 protected function hashHTMLBlocks($text) {
# Replaces block-level HTML found in $text with hash tokens (through
# _hashHTMLBlocks_callback) so later Markdown passes leave it untouched.
# Returns $text unchanged when markup is disabled.
247 if ($this->no_markup) return $text;
# Maximum indentation, in spaces, allowed before an opening block tag.
249 $less_than_tab = $this->tab_width - 1;
251 # Hashify HTML blocks:
252 # We only want to do this for block-level HTML tags, such as headers,
253 # lists, and tables. That's because we still want to wrap <p>s around
254 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
255 # phrase emphasis, and spans. The list of tags we're looking for is
258 # * List "a" is made of tags which can be both inline or block-level.
259 # These will be treated block-level when the start tag is alone on
260 # its line, otherwise they're not matched here and will be taken as
262 # * List "b" is made of tags which are always block-level;
264 $block_tags_a_re = 'ins|del';
265 $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
266 'script|noscript|style|form|fieldset|iframe|math|svg|'.
267 'article|section|nav|aside|hgroup|header|footer|'.
270 # Regular expression for the content of a block tag.
# The nested-tag fragment is repeated $nested_tags_level times by str_repeat().
271 $nested_tags_level = 4;
273 (?> # optional tag attributes
274 \s # starts with whitespace
276 [^>"/]+ # text outside quotes
278 /+(?!>) # slash not followed by ">"
280 "[^"]*" # text inside double quotes (tolerate ">")
282 \'[^\']*\' # text inside single quotes (tolerate ">")
289 [^<]+ # content without tag
291 <\2 # nested opening tag
292 '.$attr.' # attributes
296 >', $nested_tags_level). # end of opening tag
297 '.*?'. # last level nested tag content
299 </\2\s*> # closing nested tag
302 <(?!/\2\s*> # other tags with a different name
# Same pattern with back-reference \3 instead of \2, for the "a" tag group
# whose tag name is captured as $3 in the main expression.
306 $content2 = str_replace('\2', '\3', $content);
308 # First, look for nested blocks, e.g.:
311 # tags for inner block must be indented.
315 # The outermost tags must start at the left margin for this to match, and
316 # the inner nested divs must be indented.
317 # We need to do this before the next, more liberal match, because the next
318 # match will start at the first `<div>` and stop at the first `</div>`.
319 $text = preg_replace_callback('{(?>
321 (?<=\n) # Starting on its own line
323 \A\n? # the at beginning of the doc
327 # Match from `\n<tag>` to `</tag>\n`, handling nested tags
330 [ ]{0,'.$less_than_tab.'}
331 <('.$block_tags_b_re.')# start tag = $2
332 '.$attr.'> # attributes followed by > and \n
333 '.$content.' # content, support nesting
334 </\2> # the matching end tag
335 [ ]* # trailing spaces/tabs
336 (?=\n+|\Z) # followed by a newline or end of document
338 | # Special version for tags of group a.
340 [ ]{0,'.$less_than_tab.'}
341 <('.$block_tags_a_re.')# start tag = $3
342 '.$attr.'>[ ]*\n # attributes followed by >
343 '.$content2.' # content, support nesting
344 </\3> # the matching end tag
345 [ ]* # trailing spaces/tabs
346 (?=\n+|\Z) # followed by a newline or end of document
348 | # Special case just for <hr />. It was easier to make a special
349 # case than to make the other regex more complicated.
351 [ ]{0,'.$less_than_tab.'}
352 <(hr) # start tag = $2
353 '.$attr.' # attributes
354 /?> # the matching end tag
356 (?=\n{2,}|\Z) # followed by a blank line or end of document
358 | # Special case for standalone HTML comments:
360 [ ]{0,'.$less_than_tab.'}
365 (?=\n{2,}|\Z) # followed by a blank line or end of document
367 | # PHP and ASP-style processor instructions (<? and <%)
369 [ ]{0,'.$less_than_tab.'}
376 (?=\n{2,}|\Z) # followed by a blank line or end of document
380 array($this, '_hashHTMLBlocks_callback'),
385 protected function _hashHTMLBlocks_callback($matches) {
# Swaps a matched HTML block for its block-level hash token, surrounded by
# blank lines so the token ends up as a paragraph of its own.
# NOTE(review): $text is presumably taken from $matches[1] -- confirm.
387 $key = $this->hashBlock($text);
388 return "\n\n$key\n\n";
392 protected function hashPart($text, $boundary = 'X') {
394 # Called whenever a tag must be hashed when a function inserts an atomic
395 # element in the text stream. Passing $text through this function gives
396 # a unique text-token which will be reverted back when calling unhash.
398 # The $boundary argument specifies what character should be used to surround
399 # the token. By convention, "B" is used for block elements that need not
400 # be wrapped into paragraph tags at the end, ":" is used for elements
401 # that are word separators and "X" is used in the general case.
403 # Swap back any tag hash found in $text so we do not have to `unhash`
404 # multiple times at the end.
405 $text = $this->unhash($text);
407 # Then hash the block.
# Token layout: boundary, \x1A, a counter value, boundary.
# NOTE(review): $i is presumably a static counter declared just above -- confirm.
409 $key = "$boundary\x1A" . ++$i . $boundary;
410 $this->html_hashes[$key] = $text;
411 return $key; # String that will replace the tag.
415 protected function hashBlock($text) {
417 # Shortcut function for hashPart with block-level boundaries.
# Returns the token produced by hashPart($text, 'B').
419 return $this->hashPart($text, 'B');
423 protected $block_gamut = array(
# Maps method name => priority. Sorted in the constructor and applied in
# that order by runBasicBlockGamut().
425 # These are all the transformations that form block-level
426 # tags like paragraphs, headers, and list items.
429 "doHorizontalRules" => 20,
432 "doCodeBlocks" => 50,
433 "doBlockQuotes" => 60,
436 protected function runBlockGamut($text) {
438 # Run block gamut transformations.
# Hashes raw HTML blocks first, then delegates to runBasicBlockGamut().
440 # We need to escape raw HTML in Markdown source before doing anything
441 # else. This needs to be done for each block, and not only at the
442 # beginning in the Markdown function since hashed blocks can be part of
443 # list items and could have been indented. Indented blocks would have
444 # been seen as a code block in a previous pass of hashHTMLBlocks.
445 $text = $this->hashHTMLBlocks($text);
447 return $this->runBasicBlockGamut($text);
450 protected function runBasicBlockGamut($text) {
452 # Run block gamut transformations, without hashing HTML blocks. This is
453 # useful when HTML blocks are known to be already hashed, like in the first
454 # whole-document pass.
# Methods run in the priority order established by the constructor's asort().
456 foreach ($this->block_gamut as $method => $priority) {
457 $text = $this->$method($text);
460 # Finally form paragraph and restore hashed blocks.
461 $text = $this->formParagraphs($text);
467 protected function doHorizontalRules($text) {
# Replaces a line made of three or more identical -, * or _ markers
# (optionally separated by up to two spaces) with a hashed <hr> block.
468 # Do Horizontal Rules:
471 ^[ ]{0,3} # Leading space
472 ([-*_]) # $1: First marker
473 (?> # Repeated marker group
474 [ ]{0,2} # Zero, one, or two spaces.
475 \1 # Marker character
476 ){2,} # Group repeated at least twice
477 [ ]* # Tailing spaces
480 "\n".$this->hashBlock("<hr$this->empty_element_suffix")."\n",
485 protected $span_gamut = array(
# Maps method name => priority. Sorted in the constructor and applied in
# that order by runSpanGamut().
487 # These are all the transformations that occur *within* block-level
488 # tags like paragraphs, headers, and list items.
490 # Process character escapes, code spans, and inline HTML
494 # Process anchor and image tags. Images must come first,
495 # because ![foo][f] looks like an anchor.
499 # Make links out of things like `<https://example.com/>`
500 # Must come after doAnchors, because you can use < and >
501 # delimiters in inline links like [this](<url>).
503 "encodeAmpsAndAngles" => 40,
505 "doItalicsAndBold" => 50,
506 "doHardBreaks" => 60,
509 protected function runSpanGamut($text) {
511 # Run span gamut transformations.
# Each span gamut method receives the output of the previous one.
513 foreach ($this->span_gamut as $method => $priority) {
514 $text = $this->$method($text);
521 protected function doHardBreaks($text) {
# Turns two or more spaces followed by a newline into a hard line break;
# the replacement text comes from _doHardBreaks_callback().
523 return preg_replace_callback('/ {2,}\n/',
524 array($this, '_doHardBreaks_callback'), $text);
526 protected function _doHardBreaks_callback($matches) {
# Returns the hash token for a <br> tag (closed with the configured
# empty-element suffix) followed by a newline. $matches is not used.
527 return $this->hashPart("<br$this->empty_element_suffix\n");
531 protected function doAnchors($text) {
533 # Turn Markdown link shortcuts into XHTML <a> tags.
# Returns $text untouched when already inside an anchor; otherwise the
# $in_anchor flag stays raised for the duration of the three passes below.
535 if ($this->in_anchor) return $text;
536 $this->in_anchor = true;
539 # First, handle reference-style links: [link text] [id]
541 $text = preg_replace_callback('{
542 ( # wrap whole match in $1
544 ('.$this->nested_brackets_re.') # link text = $2
547 [ ]? # one optional space
548 (?:\n[ ]*)? # one optional newline followed by spaces
555 array($this, '_doAnchors_reference_callback'), $text);
558 # Next, inline-style links: [link text](url "optional title")
560 $text = preg_replace_callback('{
561 ( # wrap whole match in $1
563 ('.$this->nested_brackets_re.') # link text = $2
570 ('.$this->nested_url_parenthesis_re.') # href = $4
574 ([\'"]) # quote char = $6
577 [ \n]* # ignore any spaces/tabs between closing quote and )
578 )? # title is optional
582 array($this, '_doAnchors_inline_callback'), $text);
585 # Last, handle reference-style shortcuts: [link text]
586 # These must come last in case you've also got [link text][1]
587 # or [link text](/foo)
589 $text = preg_replace_callback('{
590 ( # wrap whole match in $1
592 ([^\[\]]+) # link text = $2; can\'t contain [ or ]
596 array($this, '_doAnchors_reference_callback'), $text);
598 $this->in_anchor = false;
601 protected function _doAnchors_reference_callback($matches) {
# Builds the hashed <a> tag for a reference-style link when its id has a
# known URL; otherwise $result falls back to the original matched text.
602 $whole_match = $matches[1];
603 $link_text = $matches[2];
# Bound by reference: $3 is absent for the [link text] shortcut form.
604 $link_id =& $matches[3];
606 if ($link_id == "") {
607 # for shortcut links like [this][] or [this].
608 $link_id = $link_text;
611 # lower-case and turn embedded newlines into spaces
612 $link_id = strtolower($link_id);
613 $link_id = preg_replace('{[ ]?\n}', ' ', $link_id);
615 if (isset($this->urls[$link_id])) {
616 $url = $this->urls[$link_id];
617 $url = $this->encodeURLAttribute($url);
619 $result = "<a href=\"$url\"";
620 if ( isset( $this->titles[$link_id] ) ) {
621 $title = $this->titles[$link_id];
622 $title = $this->encodeAttribute($title);
623 $result .= " title=\"$title\"";
# The link text itself may contain span-level Markdown.
626 $link_text = $this->runSpanGamut($link_text);
627 $result .= ">$link_text</a>";
628 $result = $this->hashPart($result);
631 $result = $whole_match;
635 protected function _doAnchors_inline_callback($matches) {
# Builds the hashed <a> tag for an inline link. The URL is $3, or $4 when
# $3 is empty; the optional title is $7.
636 $whole_match = $matches[1];
637 $link_text = $this->runSpanGamut($matches[2]);
638 $url = $matches[3] == '' ? $matches[4] : $matches[3];
639 $title =& $matches[7];
641 // if the URL was of the form <s p a c e s> it got caught by the HTML
642 // tag parser and hashed. Need to reverse the process before using the URL.
643 $unhashed = $this->unhash($url);
644 if ($unhashed != $url)
645 $url = preg_replace('/^<(.*)>$/', '\1', $unhashed);
647 $url = $this->encodeURLAttribute($url);
649 $result = "<a href=\"$url\"";
651 $title = $this->encodeAttribute($title);
652 $result .= " title=\"$title\"";
# NOTE(review): $link_text already went through runSpanGamut() above; this
# second pass looks redundant -- confirm before removing.
655 $link_text = $this->runSpanGamut($link_text);
656 $result .= ">$link_text</a>";
658 return $this->hashPart($result);
662 protected function doImages($text) {
664 # Turn Markdown image shortcuts into <img> tags.
# Two passes: reference-style images first, then inline images; each match
# is rewritten by the corresponding _doImages_*_callback().
667 # First, handle reference-style labeled images: ![alt text][id]
669 $text = preg_replace_callback('{
670 ( # wrap whole match in $1
672 ('.$this->nested_brackets_re.') # alt text = $2
675 [ ]? # one optional space
676 (?:\n[ ]*)? # one optional newline followed by spaces
684 array($this, '_doImages_reference_callback'), $text);
687 # Next, handle inline images: ![alt text](url "optional title")
688 # Don't forget: encode * and _
690 $text = preg_replace_callback('{
691 ( # wrap whole match in $1
693 ('.$this->nested_brackets_re.') # alt text = $2
695 \s? # One optional whitespace character
699 <(\S*)> # src url = $3
701 ('.$this->nested_url_parenthesis_re.') # src url = $4
705 ([\'"]) # quote char = $6
709 )? # title is optional
713 array($this, '_doImages_inline_callback'), $text);
717 protected function _doImages_reference_callback($matches) {
# Builds the hashed <img> tag for a reference-style image whose id has a
# known URL; the alt text doubles as the id when the id is empty.
718 $whole_match = $matches[1];
719 $alt_text = $matches[2];
720 $link_id = strtolower($matches[3]);
722 if ($link_id == "") {
723 $link_id = strtolower($alt_text); # for shortcut links like ![this][].
726 $alt_text = $this->encodeAttribute($alt_text);
727 if (isset($this->urls[$link_id])) {
728 $url = $this->encodeURLAttribute($this->urls[$link_id]);
729 $result = "<img src=\"$url\" alt=\"$alt_text\"";
730 if (isset($this->titles[$link_id])) {
731 $title = $this->titles[$link_id];
732 $title = $this->encodeAttribute($title);
733 $result .= " title=\"$title\"";
735 $result .= $this->empty_element_suffix;
736 $result = $this->hashPart($result);
739 # If there's no such link ID, leave intact:
740 $result = $whole_match;
745 protected function _doImages_inline_callback($matches) {
# Builds the hashed <img> tag for an inline image. The source URL is $3, or
# $4 when $3 is empty; the optional title is $7.
746 $whole_match = $matches[1];
747 $alt_text = $matches[2];
748 $url = $matches[3] == '' ? $matches[4] : $matches[3];
749 $title =& $matches[7];
751 $alt_text = $this->encodeAttribute($alt_text);
752 $url = $this->encodeURLAttribute($url);
753 $result = "<img src=\"$url\" alt=\"$alt_text\"";
755 $title = $this->encodeAttribute($title);
756 $result .= " title=\"$title\""; # $title already quoted
758 $result .= $this->empty_element_suffix;
760 return $this->hashPart($result);
764 protected function doHeaders($text) {
# Converts both header syntaxes; each match is rewritten by the matching
# _doHeaders_callback_* method.
765 # Setext-style headers:
# (a text line underlined with a run of = or - characters)
772 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
773 array($this, '_doHeaders_callback_setext'), $text);
# atx-style headers: one to six leading # characters give the level.
778 # ## Header 2 with closing hashes ##
782 $text = preg_replace_callback('{
783 ^(\#{1,6}) # $1 = string of #\'s
785 (.+?) # $2 = Header text
787 \#* # optional closing #\'s (not counted)
790 array($this, '_doHeaders_callback_atx'), $text);
795 protected function _doHeaders_callback_setext($matches) {
# Builds the hashed header block for a Setext-style header.
# $matches[1] is the header text, $matches[2] the underline made of = or -.
# Returns the block token surrounded by newlines.
796 # Terrible hack to check we haven't found an empty list item.
797 if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
# An underline of "=" gives a level 1 header, "-" a level 2 header.
# Use the [] offset syntax: the {} string offset form was deprecated in
# PHP 7.4 and is a parse error since PHP 8.0.
800 $level = $matches[2][0] == '=' ? 1 : 2;
802 # id attribute generation
803 $idAtt = $this->_generateIdFromHeaderValue($matches[1]);
805 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[1])."</h$level>";
806 return "\n" . $this->hashBlock($block) . "\n\n";
808 protected function _doHeaders_callback_atx($matches) {
# Builds the hashed header block for an atx-style header.
# $matches[1] is the run of # characters, $matches[2] the header text.
810 # id attribute generation
811 $idAtt = $this->_generateIdFromHeaderValue($matches[2]);
# The header level equals the number of leading # characters.
813 $level = strlen($matches[1]);
814 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[2])."</h$level>";
815 return "\n" . $this->hashBlock($block) . "\n\n";
818 protected function _generateIdFromHeaderValue($headerValue) {
820 # if a header_id_func property is set, we can use it to automatically
821 # generate an id attribute.
823 # This method returns a string in the form id="foo", or an empty string
# (the returned attribute carries a leading space so it can be appended
# directly after the tag name).
825 if (!is_callable($this->header_id_func)) {
# The callback receives the raw header text; a falsy result means "no id".
828 $idValue = call_user_func($this->header_id_func, $headerValue);
829 if (!$idValue) return "";
831 return ' id="' . $this->encodeAttribute($idValue) . '"';
835 protected function doLists($text) {
837 # Form HTML ordered (numbered) and unordered (bulleted) lists.
# Each whole list found is rewritten by _doLists_callback().
839 $less_than_tab = $this->tab_width - 1;
841 # Re-usable patterns to match list item bullets and number markers:
842 $marker_ul_re = '[*+-]';
843 $marker_ol_re = '\d+[\.]';
# Maps each marker pattern to the pattern of the other list kind.
845 $markers_relist = array(
846 $marker_ul_re => $marker_ol_re,
847 $marker_ol_re => $marker_ul_re,
850 foreach ($markers_relist as $marker_re => $other_marker_re) {
851 # Re-usable pattern to match any entire ul or ol list:
855 ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces
856 ('.$marker_re.') # $4 = first list item marker
865 (?! # Negative lookahead for another list item marker
870 (?= # Lookahead for another kind of list
872 \3 # Must have the same indentation
873 '.$other_marker_re.'[ ]+
879 # We use a different prefix before nested lists than top-level lists.
880 # See extended comment in _ProcessListItems().
882 if ($this->list_level) {
883 $text = preg_replace_callback('{
887 array($this, '_doLists_callback'), $text);
890 $text = preg_replace_callback('{
891 (?:(?<=\n)\n|\A\n?) # Must eat the newline
894 array($this, '_doLists_callback'), $text);
900 protected function _doLists_callback($matches) {
# Wraps the processed items of one matched list in a hashed <ul> or <ol>
# block; $matches[4] is the first item marker and decides the list type.
901 # Re-usable patterns to match list item bullets and number markers:
902 $marker_ul_re = '[*+-]';
903 $marker_ol_re = '\d+[\.]';
904 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
905 $marker_ol_start_re = '[0-9]+';
908 $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";
# Narrow the marker pattern to the detected list type only.
910 $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re );
913 $result = $this->processListItems($list, $marker_any_re);
916 if ($this->enhanced_ordered_list) {
917 # Get the start number for ordered list.
918 if ($list_type == 'ol') {
919 $ol_start_array = array();
920 $ol_start_check = preg_match("/$marker_ol_start_re/", $matches[4], $ol_start_array);
921 if ($ol_start_check){
922 $ol_start = $ol_start_array[0];
# A start attribute is emitted only for ordered lists starting above 1.
927 if ($ol_start > 1 && $list_type == 'ol'){
928 $result = $this->hashBlock("<$list_type start=\"$ol_start\">\n" . $result . "</$list_type>");
930 $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>");
932 return "\n". $result ."\n\n";
# Non-zero while list items are being processed; doLists() uses it to pick
# the pattern prefix for nested lists.
935 protected $list_level = 0;
937 protected function processListItems($list_str, $marker_any_re) {
939 # Process the contents of a single ordered or unordered list, splitting it
940 # into individual list items.
# $marker_any_re is the marker pattern for this list's type; each item found
# is rewritten by _processListItems_callback().
942 # The $this->list_level global keeps track of when we're inside a list.
943 # Each time we enter a list, we increment it; when we leave a list,
944 # we decrement. If it's zero, we're not in a list anymore.
946 # We do this because when we're not inside a list, we want to treat
947 # something like this:
949 # I recommend upgrading to version
950 # 8. Oops, now this line is treated
953 # As a single paragraph, despite the fact that the second line starts
954 # with a digit-period-space sequence.
956 # Whereas when we're inside a list (or sub-list), that line will be
957 # treated as the start of a sub-list. What a kludge, huh? This is
958 # an aspect of Markdown's syntax that's hard to parse perfectly
959 # without resorting to mind-reading. Perhaps the solution is to
960 # change the syntax rules such that sub-lists must start with a
961 # starting cardinal number; e.g. "1." or "a.".
965 # trim trailing blank lines:
966 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
968 $list_str = preg_replace_callback('{
969 (\n)? # leading line = $1
970 (^[ ]*) # leading whitespace = $2
971 ('.$marker_any_re.' # list marker and space = $3
972 (?:[ ]+|(?=\n)) # space only required if item is not empty
974 ((?s:.*?)) # list item text = $4
975 (?:(\n+(?=\n))|\n) # tailing blank line = $5
976 (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
978 array($this, '_processListItems_callback'), $list_str);
983 protected function _processListItems_callback($matches) {
# Turns one matched list item into an <li> element.
# NOTE(review): $item is presumably taken from $matches[4] (the item text)
# -- confirm.
985 $leading_line =& $matches[1];
986 $leading_space =& $matches[2];
987 $marker_space = $matches[3];
988 $tailing_blank_line =& $matches[5];
# Items surrounded by, or containing, blank lines get block-level processing.
990 if ($leading_line || $tailing_blank_line ||
991 preg_match('/\n{2,}/', $item))
993 # Replace marker with the appropriate whitespace indentation
994 $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item;
995 $item = $this->runBlockGamut($this->outdent($item)."\n");
998 # Recursion for sub-lists:
999 $item = $this->doLists($this->outdent($item));
1000 $item = preg_replace('/\n+$/', '', $item);
1001 $item = $this->runSpanGamut($item);
1004 return "<li>" . $item . "</li>\n";
1008 protected function doCodeBlocks($text) {
1010 # Process Markdown `<pre><code>` blocks.
# Lines indented by tab_width spaces form a code block; each block found is
# rewritten by _doCodeBlocks_callback().
1012 $text = preg_replace_callback('{
1014 ( # $1 = the code block -- one or more lines, starting with a space/tab
1016 [ ]{'.$this->tab_width.'} # Lines must start with a tab or a tab-width of spaces
1020 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1022 array($this, '_doCodeBlocks_callback'), $text);
1026 protected function _doCodeBlocks_callback($matches) {
# Converts one matched code block ($1) into a hashed <pre><code> block.
1027 $codeblock = $matches[1];
1029 $codeblock = $this->outdent($codeblock);
# A configured content function takes over the conversion to HTML; it is
# called with the code and an empty string.
1030 if ($this->code_block_content_func) {
1031 $codeblock = call_user_func($this->code_block_content_func, $codeblock, "");
# ENT_NOQUOTES: quotes are left as-is, only &, < and > are converted.
1033 $codeblock = htmlspecialchars($codeblock, ENT_NOQUOTES);
1036 # trim leading newlines and trailing newlines
1037 $codeblock = preg_replace('/\A\n+|\n+\z/', '', $codeblock);
1039 $codeblock = "<pre><code>$codeblock\n</code></pre>";
1040 return "\n\n".$this->hashBlock($codeblock)."\n\n";
1044 protected function makeCodeSpan($code) {
1046 # Create a code span markup for $code. Called from handleSpanToken.
# $code is trimmed and HTML-escaped (quotes untouched), wrapped in <code>
# tags, and the hash token for the result is returned.
1048 $code = htmlspecialchars(trim($code), ENT_NOQUOTES);
1049 return $this->hashPart("<code>$code</code>");
# Emphasis token patterns. In each list the '' key holds the pattern used
# when no such emphasis is open, and the other keys hold the pattern for the
# marker named by the key (as combined by prepareItalicsAndBold()).
1053 protected $em_relist = array(
1054 '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?![\.,:;]?\s)',
1055 '*' => '(?<![\s*])\*(?!\*)',
1056 '_' => '(?<![\s_])_(?!_)',
1058 protected $strong_relist = array(
1059 '' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?![\.,:;]?\s)',
1060 '**' => '(?<![\s*])\*\*(?!\*)',
1061 '__' => '(?<![\s_])__(?!_)',
1063 protected $em_strong_relist = array(
1064 '' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?![\.,:;]?\s)',
1065 '***' => '(?<![\s*])\*\*\*(?!\*)',
1066 '___' => '(?<![\s_])___(?!_)',
# Filled by prepareItalicsAndBold(): one master pattern per "$em$strong" key.
1068 protected $em_strong_prepared_relist;
1070 protected function prepareItalicsAndBold() {
1072 # Prepare regular expressions for searching emphasis tokens in any
# For every combination of em state and strong state, one alternation of
# the applicable token patterns is stored under the key "$em$strong".
1075 foreach ($this->em_relist as $em => $em_re) {
1076 foreach ($this->strong_relist as $strong => $strong_re) {
1077 # Construct list of allowed token expressions.
1078 $token_relist = array();
# The combined three-character pattern goes first when one exists.
1079 if (isset($this->em_strong_relist["$em$strong"])) {
1080 $token_relist[] = $this->em_strong_relist["$em$strong"];
1082 $token_relist[] = $em_re;
1083 $token_relist[] = $strong_re;
1085 # Construct master expression from list.
1086 $token_re = '{('. implode('|', $token_relist) .')}';
1087 $this->em_strong_prepared_relist["$em$strong"] = $token_re;
1092 protected function doItalicsAndBold($text) {
# Converts emphasis markers in $text into hashed <em>/<strong> spans and
# returns the resulting text.
# Works as a small stack machine: $token_stack holds the markers currently
# open and $text_stack the text collected inside each of them, with
# index 0 being the innermost level.
1093 $token_stack = array('');
1094 $text_stack = array('');
# True while inside a span opened by a three-character marker.
1097 $tree_char_em = false;
1101 # Get prepared regular expression for searching emphasis tokens
1102 # in current context.
1104 $token_re = $this->em_strong_prepared_relist["$em$strong"];
1107 # Each loop iteration search for the next emphasis token.
1108 # Each token is then passed to handleSpanToken.
# Limit 2 with delimiter capture: [text before token, token, remaining text].
1110 $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
1111 $text_stack[0] .= $parts[0];
1112 $token =& $parts[1];
1115 if (empty($token)) {
1116 # Reached end of text span: empty stack without emitting.
1117 # any more emphasis.
1118 while ($token_stack[0]) {
1119 $text_stack[1] .= array_shift($token_stack);
1120 $text_stack[0] .= array_shift($text_stack);
1125 $token_len = strlen($token);
1126 if ($tree_char_em) {
1127 # Reached closing marker while inside a three-char emphasis.
1128 if ($token_len == 3) {
1129 # Three-char closing marker, close em and strong.
1130 array_shift($token_stack);
1131 $span = array_shift($text_stack);
1132 $span = $this->runSpanGamut($span);
1133 $span = "<strong><em>$span</em></strong>";
1134 $text_stack[0] .= $this->hashPart($span);
1138 # Other closing marker: close one em or strong and
1139 # change current token state to match the other
# Use the [] offset syntax: the {} string offset form was deprecated in
# PHP 7.4 and is a parse error since PHP 8.0.
1140 $token_stack[0] = str_repeat($token[0], 3-$token_len);
1141 $tag = $token_len == 2 ? "strong" : "em";
1142 $span = $text_stack[0];
1143 $span = $this->runSpanGamut($span);
1144 $span = "<$tag>$span</$tag>";
1145 $text_stack[0] = $this->hashPart($span);
1146 $$tag = ''; # $$tag stands for $em or $strong
1148 $tree_char_em = false;
1149 } else if ($token_len == 3) {
1151 # Reached closing marker for both em and strong.
1152 # Closing strong marker:
1153 for ($i = 0; $i < 2; ++$i) {
1154 $shifted_token = array_shift($token_stack);
1155 $tag = strlen($shifted_token) == 2 ? "strong" : "em";
1156 $span = array_shift($text_stack);
1157 $span = $this->runSpanGamut($span);
1158 $span = "<$tag>$span</$tag>";
1159 $text_stack[0] .= $this->hashPart($span);
1160 $$tag = ''; # $$tag stands for $em or $strong
1163 # Reached opening three-char emphasis marker. Push on token
1164 # stack; will be handled by the special condition above.
1167 array_unshift($token_stack, $token);
1168 array_unshift($text_stack, '');
1169 $tree_char_em = true;
1171 } else if ($token_len == 2) {
1173 # Unwind any dangling emphasis marker:
1174 if (strlen($token_stack[0]) == 1) {
1175 $text_stack[1] .= array_shift($token_stack);
1176 $text_stack[0] .= array_shift($text_stack);
1178 # Closing strong marker:
1179 array_shift($token_stack);
1180 $span = array_shift($text_stack);
1181 $span = $this->runSpanGamut($span);
1182 $span = "<strong>$span</strong>";
1183 $text_stack[0] .= $this->hashPart($span);
1186 array_unshift($token_stack, $token);
1187 array_unshift($text_stack, '');
1191 # Here $token_len == 1
1193 if (strlen($token_stack[0]) == 1) {
1194 # Closing emphasis marker:
1195 array_shift($token_stack);
1196 $span = array_shift($text_stack);
1197 $span = $this->runSpanGamut($span);
1198 $span = "<em>$span</em>";
1199 $text_stack[0] .= $this->hashPart($span);
1202 $text_stack[0] .= $token;
1205 array_unshift($token_stack, $token);
1206 array_unshift($text_stack, '');
1211 return $text_stack[0];
1215 protected function doBlockQuotes($text) {
# Finds runs of lines introduced by ">" and lets _doBlockQuotes_callback()
# turn each run into a <blockquote> block.
1216 $text = preg_replace_callback('/
1217 ( # Wrap whole match in $1
1219 ^[ ]*>[ ]? # ">" at the start of a line
1220 .+\n # rest of the first line
1221 (.+\n)* # subsequent consecutive lines
1226 array($this, '_doBlockQuotes_callback'), $text);
1230 protected function _doBlockQuotes_callback($matches) {
# Converts one matched quote into a hashed <blockquote> block.
# NOTE(review): $bq is presumably taken from $matches[1] -- confirm.
1232 # trim one level of quoting - trim whitespace-only lines
1233 $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
1234 $bq = $this->runBlockGamut($bq); # recurse
# Prefix every line of the processed content with a space.
1236 $bq = preg_replace('/^/m', " ", $bq);
1237 # These leading spaces cause problem with <pre> content,
1238 # so we need to fix that:
1239 $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
1240 array($this, '_doBlockQuotes_callback2'), $bq);
1242 return "\n". $this->hashBlock("<blockquote>\n$bq\n</blockquote>")."\n\n";
1244 protected function _doBlockQuotes_callback2($matches) {
# Removes the single leading space that _doBlockQuotes_callback() added in
# front of each line of a <pre> block.
# NOTE(review): $pre is presumably taken from $matches[1] -- confirm.
1246 $pre = preg_replace('/^ /m', '', $pre);
1251 protected function formParagraphs($text) {
# Splits $text on blank lines and returns the pieces re-joined with blank
# lines: ordinary pieces are run through the span gamut and given a <p>
# tag, hashed block tokens are swapped back for their stored HTML.
1254 # $text - string to process with html <p> tags
1256 # Strip leading and trailing lines:
1257 $text = preg_replace('/\A\n+|\n+\z/', '', $text);
1259 $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
1262 # Wrap <p> tags and unhashify HTML blocks
1264 foreach ($grafs as $key => $value) {
# A piece that is exactly one block-level hash token (B\x1A<number>B) is
# not a paragraph.
1265 if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
1267 $value = $this->runSpanGamut($value);
# Leading spaces are replaced by the opening <p> tag.
1268 $value = preg_replace('/^([ ]*)/', "<p>", $value);
1270 $grafs[$key] = $this->unhash($value);
1274 # Modify elements of @grafs in-place...
1276 $block = $this->html_hashes[$graf];
# The commented-out code below is a disabled handler for <div markdown="...">
# blocks, kept for reference.
1278 // if (preg_match('{
1280 // ( # $1 = <div> tag
1284 // markdown\s*=\s* ([\'"]) # $2 = attr quote char
1290 // ( # $3 = contents
1293 // (</div>) # $4 = closing tag
1295 // }xs', $block, $matches))
1297 // list(, $div_open, , $div_content, $div_close) = $matches;
1299 // # We can't call Markdown(), because that resets the hash;
1300 // # that initialization code should be pulled into its own sub, though.
1301 // $div_content = $this->hashHTMLBlocks($div_content);
1303 // # Run document gamut methods on the content.
1304 // foreach ($this->document_gamut as $method => $priority) {
1305 // $div_content = $this->$method($div_content);
1308 // $div_open = preg_replace(
1309 // '{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open);
1311 // $graf = $div_open . "\n" . $div_content . "\n" . $div_close;
1313 $grafs[$key] = $graf;
1317 return implode("\n\n", $grafs);
protected function encodeAttribute($text) {
    #
    # Encode text for a double-quoted HTML attribute. This function
    # is *not* suitable for attributes enclosed in single quotes.
    #
    # Params:
    #   $text - raw attribute value
    #
    # Returns the value with ampersands and "<" handled by
    # encodeAmpsAndAngles() and every double quote turned into &quot;.
    #
    $text = $this->encodeAmpsAndAngles($text);
    # The double quote must become an entity, otherwise it would terminate
    # the attribute value.
    $text = str_replace('"', '&quot;', $text);
    return $text;
}
protected function encodeURLAttribute($url, &$text = null) {
    #
    # Encode text for a double-quoted HTML attribute containing a URL,
    # applying the URL filter if set. Also generates the textual
    # representation for the URL (removing mailto: or tel:) storing it in $text.
    # This function is *not* suitable for attributes enclosed in single quotes.
    #
    # Params:
    #   $url  - URL to encode
    #   $text - (output) text to display for the URL
    #
    # Returns the encoded URL.
    #
    if ($this->url_filter_func) {
        $url = call_user_func($this->url_filter_func, $url);
    }

    if (preg_match('{^mailto:}i', $url)) {
        # 7 == strlen("mailto:"); the obfuscator fills $text with the
        # encoded address minus that prefix.
        $url = $this->encodeEntityObfuscatedAttribute($url, $text, 7);
    }
    else if (preg_match('{^tel:}i', $url)) {
        $url = $this->encodeAttribute($url);
        # 4 == strlen("tel:")
        $text = substr($url, 4);
    }
    else {
        $url = $this->encodeAttribute($url);
        $text = $url;
    }

    return $url;
}
protected function encodeAmpsAndAngles($text) {
    #
    # Smart processing for ampersands and angle brackets that need to
    # be encoded. Valid character entities are left alone unless the
    # no-entities mode is set.
    #
    # Params:
    #   $text - text to encode
    #
    # Returns the text with "&" and "<" replaced by &amp; and &lt;.
    #
    if ($this->no_entities) {
        # Entities are disallowed: every ampersand is literal.
        $text = str_replace('&', '&amp;', $text);
    } else {
        # Ampersand-encoding based entirely on Nat Irons's Amputator
        # MT plugin: <http://bumppo.net/projects/amputator/>
        # Only ampersands that do not start an entity are encoded.
        $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
            '&amp;', $text);
    }
    # Encode remaining <'s
    $text = str_replace('<', '&lt;', $text);

    return $text;
}
protected function doAutoLinks($text) {
    #
    # Turn <scheme:...> URLs and <address@domain> e-mail addresses written
    # between angle brackets into links, using the two _doAutoLinks_*
    # callbacks to build the replacement.
    #
    # Params:
    #   $text - span-level text to scan
    #
    # Returns the text with every automatic link replaced.
    #
    $text = preg_replace_callback('{<((https?|ftp|dict|tel):[^\'">\s]+)>}i',
        array($this, '_doAutoLinks_url_callback'), $text);

    # Email addresses: <address@domain.foo>
    $text = preg_replace_callback('{
        <
        (?:mailto:)?
        (
            (?:
                [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
            |
                ".*?"
            )
            \@
            (?:
                [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
            |
                \[[\d.a-fA-F:]+\]   # IPv4 & IPv6
            )
        )
        >
        }xi',
        array($this, '_doAutoLinks_email_callback'), $text);

    return $text;
}
protected function _doAutoLinks_url_callback($matches) {
    #
    # Callback for the URL pattern of doAutoLinks: build the anchor for the
    # URL captured in $matches[1].
    #
    # Returns the anchor markup after it has gone through hashPart().
    #
    $label = null; # filled in by encodeURLAttribute()
    $href = $this->encodeURLAttribute($matches[1], $label);

    return $this->hashPart('<a href="' . $href . '">' . $label . '</a>');
}
protected function _doAutoLinks_email_callback($matches) {
    #
    # Callback for the e-mail pattern of doAutoLinks: build a mailto: anchor
    # for the address captured in $matches[1].
    #
    # Returns the anchor markup after it has gone through hashPart().
    #
    $label = null; # filled in by encodeURLAttribute()
    $href = $this->encodeURLAttribute('mailto:' . $matches[1], $label);

    return $this->hashPart('<a href="' . $href . '">' . $label . '</a>');
}
protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) {
    #
    # Input: some text to obfuscate, e.g. "mailto:foo@example.com"
    #
    # Output: the same text but with most characters encoded as either a
    #   decimal or hex entity, in the hopes of foiling most address
    #   harvesting spam bots. The result looks like:
    #
    #     &#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111; ...
    #
    # Note: the additional output $tail is assigned the same value as the
    # output, minus the number of characters specified by $head_length.
    #
    # Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
    # With some optimizations by Milian Wolff. Forced encoding of HTML
    # attribute special characters by Allan Odgaard.
    #
    if ($text == "") return $tail = "";

    # One element per byte. str_split() is used rather than a regex split
    # on "(?!$)" because "$" also matches before a trailing newline, which
    # would glue that newline to the previous character and lose it.
    $chars = str_split($text);
    $seed = (int)abs(crc32($text) / strlen($text)); # Deterministic seed.

    foreach ($chars as $key => $char) {
        $ord = ord($char);
        # Ignore non-ascii chars.
        if ($ord >= 128) {
            continue;
        }

        $r = ($seed * (1 + $key)) % 100; # Pseudo-random function.
        # roughly 10% raw, 45% hex, 45% dec
        # '@' *must* be encoded. I insist.
        # '"' and '>' have to be encoded inside the attribute
        if ($r > 90 && strpos('@"&>', $char) === false) {
            continue; # leave the character as is
        }
        $chars[$key] = ($r < 45)
            ? '&#x' . dechex($ord) . ';'
            : '&#' . $ord . ';';
    }

    $text = implode('', $chars);
    $tail = $head_length ? implode('', array_slice($chars, $head_length)) : $text;

    return $text;
}
protected function parseSpan($str) {
    #
    # Take the string $str and parse it into tokens, hashing embedded HTML,
    # escaped characters and handling code spans.
    #
    # Params:
    #   $str - span-level text to tokenize
    #
    # Returns the text with every token replaced by the value given by
    # handleSpanToken().
    #
    $output = '';

    # HTML tags are only recognised as tokens when markup is allowed.
    $span_re = '{
            (
                \\\\'.$this->escape_chars_re.'
            |
                (?<![`\\\\])
                `+                      # code span marker
        '.( $this->no_markup ? '' : '
            |
                <!--    .*?     -->     # comment
            |
                <\?.*?\?> | <%.*?%>     # processing instruction
            |
                <[!$]?[-a-zA-Z0-9:_]+   # regular tags
                (?>
                    \s
                    (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
                )?
                >
            |
                <[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag
            |
                </[-a-zA-Z0-9:_]+\s*> # closing tag
        ').'
            )
            }xs';

    while (true) {
        #
        # Each loop iteration searches for either the next tag, the next
        # opening code span marker, or the next escaped character.
        # Each token is then passed to handleSpanToken.
        #
        $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

        # Create token from text preceding tag.
        if ($parts[0] != "") {
            $output .= $parts[0];
        }

        # Check if we reach the end.
        if (!isset($parts[1])) {
            break;
        }

        # handleSpanToken() receives the remaining text by reference and
        # may consume part of it (e.g. the body of a code span).
        $output .= $this->handleSpanToken($parts[1], $parts[2]);
        $str = $parts[2];
    }

    return $output;
}
protected function handleSpanToken($token, &$str) {
    #
    # Handle $token provided by parseSpan by determining its nature and
    # returning the corresponding value that should replace it.
    #
    # Params:
    #   $token - the token matched by parseSpan
    #   $str   - (in/out) text following the token; the consumed part is
    #            removed when the token opens a code span
    #
    # Square-bracket offsets are used because the curly-brace form
    # ($token{0}) is no longer accepted by PHP 8.
    #
    switch ($token[0]) {
        case "\\":
            # Backslash escape: emit the escaped character as an entity.
            return $this->hashPart("&#". ord($token[1]). ";");
        case "`":
            # Search for end marker in remaining text.
            if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
                $str, $matches))
            {
                $str = $matches[2];
                $codespan = $this->makeCodeSpan($matches[1]);
                return $this->hashPart($codespan);
            }
            return $token; // return as text since no ending marker found.
        default:
            return $this->hashPart($token);
    }
}
protected function outdent($text) {
    #
    # Strip one indentation level from the start of every line: either a
    # single tab or between one and $this->tab_width spaces.
    #
    $one_level = '/^(\t|[ ]{1,' . $this->tab_width . '})/m';

    return preg_replace($one_level, '', $text);
}
# String length function for detab. `_initDetab` will create a function to
# handle UTF-8 if the default function does not exist.
1559 protected $utf8_strlen = 'mb_strlen';
protected function detab($text) {
    #
    # Replace tabs with the appropriate amount of space.
    #
    # Params:
    #   $text - text possibly containing tab characters
    #
    # Returns the text with every line that contains a tab rewritten by
    # _detab_callback.
    #
    # For each line we separate the line in blocks delimited by
    # tab characters. Then we reconstruct every line by adding the
    # appropriate number of space between each blocks.
    $text = preg_replace_callback('/^.*\t.*$/m',
        array($this, '_detab_callback'), $text);

    return $text;
}
protected function _detab_callback($matches) {
    #
    # Callback for detab: expand the tabs of one line.
    #
    # Params:
    #   $matches - regex match array; $matches[0] is the whole line
    #
    # Returns the line with each tab replaced by the number of spaces
    # needed to reach the next multiple of $this->tab_width.
    #
    $strlen = $this->utf8_strlen; # strlen function for UTF-8.

    # Split in blocks.
    $blocks = explode("\t", $matches[0]);
    # Start from the first block, then add each following block to the line.
    $line = array_shift($blocks);
    foreach ($blocks as $block) {
        # Calculate amount of space, insert spaces, insert block.
        $amount = $this->tab_width -
            $strlen($line, 'UTF-8') % $this->tab_width;
        $line .= str_repeat(" ", $amount) . $block;
    }
    return $line;
}
protected function _initDetab() {
    #
    # Check for the availability of the function in the `utf8_strlen` property
    # (initially `mb_strlen`). If the function is not available, create a
    # function that will loosely count the number of UTF-8 characters with a
    # regular expression.
    #
    # is_callable() also accepts the closure installed below, so calling
    # this method more than once is harmless.
    if (is_callable($this->utf8_strlen)) return;

    # A closure replaces create_function(), which no longer exists in PHP 8.
    # Extra arguments passed by callers (the encoding name) are ignored.
    $this->utf8_strlen = function ($text) {
        return preg_match_all(
            '/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/',
            $text, $m);
    };
}
protected function unhash($text) {
    #
    # Swap back in all the tags hashed by _HashHTMLBlocks.
    #
    # A token is any character, \x1A, a run of digits, then the same
    # character again; each one is resolved by _unhash_callback.
    #
    $resolver = array($this, '_unhash_callback');

    return preg_replace_callback('/(.)\x1A[0-9]+\1/', $resolver, $text);
}
protected function _unhash_callback($matches) {
    #
    # Callback for unhash: look up the text stored in $this->html_hashes
    # under the matched token ($matches[0]).
    #
    $token = $matches[0];

    return $this->html_hashes[$token];
}