extlib/Michelf/Markdown.php

   1 <?php
   2 #
   3 # Markdown  -  A text-to-HTML conversion tool for web writers
   4 #
   5 # PHP Markdown
   6 # Copyright (c) 2004-2015 Michel Fortin
   7 # <https://michelf.ca/projects/php-markdown/>
   8 #
   9 # Original Markdown
  10 # Copyright (c) 2004-2006 John Gruber
  11 # <https://daringfireball.net/projects/markdown/>
  12 #
  13 namespace Michelf;
  14
  15
  16 #
  17 # Markdown Parser Class
  18 #
  19
  20 class Markdown implements MarkdownInterface {
  21
  22         ### Version ###
  23
        # Version string of this library.
        const  MARKDOWNLIB_VERSION  =  "1.6.0";
  25
  26         ### Simple Function Interface ###
  27
  28         public static function defaultTransform($text) {
  29         #
  30         # Initialize the parser and return the result of its transform method.
  31         # This will work fine for derived classes too.
  32         #
  33                 # Take parser class on which this function was called.
  34                 $parser_class = \get_called_class();
  35
  36                 # try to take parser from the static parser list
  37                 static $parser_list;
  38                 $parser =& $parser_list[$parser_class];
  39
  40                 # create the parser it not already set
  41                 if (!$parser)
  42                         $parser = new $parser_class;
  43
  44                 # Transform text using parser.
  45                 return $parser->transform($text);
  46         }
  47
  48         ### Configuration Variables ###
  49
        # Suffix that closes the empty elements generated by the parser
        # (e.g. the <hr> and <br> tags). Change to ">" for HTML output.
        public $empty_element_suffix = " />";
        # Indentation width, in spaces. Link definitions and HTML blocks are
        # only recognized when indented by at most tab_width - 1 spaces.
        public $tab_width = 4;

        # Change to `true` to disallow markup or entities.
        # With $no_markup set, hashHTMLBlocks() returns its input untouched.
        public $no_markup = false;
        public $no_entities = false;

        # Predefined urls and titles for reference links and images.
        # setup() copies them into the working tables before each transform.
        public $predef_urls = array();
        public $predef_titles = array();

        # Optional filter function for URLs
        public $url_filter_func = null;

        # Optional header id="" generation callback function.
        public $header_id_func = null;

        # Optional function for converting code block content to HTML
        public $code_block_content_func = null;

        # Class attribute to toggle "enhanced ordered list" behaviour
        # setting this to true will allow ordered lists to start from the index
        # number that is defined first.  For example:
        # 2. List item two
        # 3. List item three
        #
        # becomes
        # <ol start="2">
        # <li>List item two</li>
        # <li>List item three</li>
        # </ol>
        public $enhanced_ordered_list = false;
  83
  84         ### Parser Implementation ###
  85
        # Regex to match balanced [brackets].
        # Needed to insert a maximum bracket depth while converting to PHP.
        # The *_re strings are assembled in the constructor from the matching
        # *_depth values.
        protected $nested_brackets_depth = 6;
        protected $nested_brackets_re;

        # Same scheme for balanced (parentheses) in inline link and image URLs.
        protected $nested_url_parenthesis_depth = 4;
        protected $nested_url_parenthesis_re;

        # Characters treated as escapable, and the regex character class the
        # constructor builds from them.
        protected $escape_chars = '\`*_{}[]()>#+-.!';
        protected $escape_chars_re;
  97
  98
  99         public function __construct() {
 100         #
 101         # Constructor function. Initialize appropriate member variables.
 102         #
 103                 $this->_initDetab();
 104                 $this->prepareItalicsAndBold();
 105
 106                 $this->nested_brackets_re =
 107                         str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth).
 108                         str_repeat('\])*', $this->nested_brackets_depth);
 109
 110                 $this->nested_url_parenthesis_re =
 111                         str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth).
 112                         str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth);
 113
 114                 $this->escape_chars_re = '['.preg_quote($this->escape_chars).']';
 115
 116                 # Sort document, block, and span gamut in ascendent priority order.
 117                 asort($this->document_gamut);
 118                 asort($this->block_gamut);
 119                 asort($this->span_gamut);
 120         }
 121
 122
        # Internal hashes used during transformation. setup() resets them
        # before each run (urls and titles start from the predefined values)
        # and teardown() empties them afterwards.
        protected $urls = array();
        protected $titles = array();
        protected $html_hashes = array();

        # Status flag to avoid invalid nesting: doAnchors() sets it while it is
        # running and returns its input unchanged when it is already set.
        protected $in_anchor = false;
 130
 131
 132         protected function setup() {
 133         #
 134         # Called before the transformation process starts to setup parser
 135         # states.
 136         #
 137                 # Clear global hashes.
 138                 $this->urls = $this->predef_urls;
 139                 $this->titles = $this->predef_titles;
 140                 $this->html_hashes = array();
 141
 142                 $this->in_anchor = false;
 143         }
 144
 145         protected function teardown() {
 146         #
 147         # Called after the transformation process to clear any variable
 148         # which may be taking up memory unnecessarly.
 149         #
 150                 $this->urls = array();
 151                 $this->titles = array();
 152                 $this->html_hashes = array();
 153         }
 154
 155
 156         public function transform($text) {
 157         #
 158         # Main function. Performs some preprocessing on the input text
 159         # and pass it through the document gamut.
 160         #
 161                 $this->setup();
 162
 163                 # Remove UTF-8 BOM and marker character in input, if present.
 164                 $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
 165
 166                 # Standardize line endings:
 167                 #   DOS to Unix and Mac to Unix
 168                 $text = preg_replace('{\r\n?}', "\n", $text);
 169
 170                 # Make sure $text ends with a couple of newlines:
 171                 $text .= "\n\n";
 172
 173                 # Convert all tabs to spaces.
 174                 $text = $this->detab($text);
 175
 176                 # Turn block-level HTML blocks into hash entries
 177                 $text = $this->hashHTMLBlocks($text);
 178
 179                 # Strip any lines consisting only of spaces and tabs.
 180                 # This makes subsequent regexen easier to write, because we can
 181                 # match consecutive blank lines with /\n+/ instead of something
 182                 # contorted like /[ ]*\n+/ .
 183                 $text = preg_replace('/^[ ]+$/m', '', $text);
 184
 185                 # Run document gamut methods.
 186                 foreach ($this->document_gamut as $method => $priority) {
 187                         $text = $this->$method($text);
 188                 }
 189
 190                 $this->teardown();
 191
 192                 return $text . "\n";
 193         }
 194
        # Methods that transform() applies to the whole document, mapped to
        # their priority. The constructor sorts the list by ascending priority.
        protected $document_gamut = array(
                # Strip link definitions, store in hashes.
                "stripLinkDefinitions" => 20,

                "runBasicBlockGamut"   => 30,
                );
 201
 202
 203         protected function stripLinkDefinitions($text) {
 204         #
 205         # Strips link definitions from text, stores the URLs and titles in
 206         # hash references.
 207         #
 208                 $less_than_tab = $this->tab_width - 1;
 209
 210                 # Link defs are in the form: ^[id]: url "optional title"
 211                 $text = preg_replace_callback('{
 212                                                         ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
 213                                                           [ ]*
 214                                                           \n?                           # maybe *one* newline
 215                                                           [ ]*
 216                                                         (?:
 217                                                           <(.+?)>                       # url = $2
 218                                                         |
 219                                                           (\S+?)                        # url = $3
 220                                                         )
 221                                                           [ ]*
 222                                                           \n?                           # maybe one newline
 223                                                           [ ]*
 224                                                         (?:
 225                                                                 (?<=\s)                 # lookbehind for whitespace
 226                                                                 ["(]
 227                                                                 (.*?)                   # title = $4
 228                                                                 [")]
 229                                                                 [ ]*
 230                                                         )?      # title is optional
 231                                                         (?:\n+|\Z)
 232                         }xm',
 233                         array($this, '_stripLinkDefinitions_callback'),
 234                         $text);
 235                 return $text;
 236         }
 237         protected function _stripLinkDefinitions_callback($matches) {
 238                 $link_id = strtolower($matches[1]);
 239                 $url = $matches[2] == '' ? $matches[3] : $matches[2];
 240                 $this->urls[$link_id] = $url;
 241                 $this->titles[$link_id] =& $matches[4];
 242                 return ''; # String that will replace the block
 243         }
 244
 245
 246         protected function hashHTMLBlocks($text) {
 247                 if ($this->no_markup)  return $text;
 248
 249                 $less_than_tab = $this->tab_width - 1;
 250
 251                 # Hashify HTML blocks:
 252                 # We only want to do this for block-level HTML tags, such as headers,
 253                 # lists, and tables. That's because we still want to wrap <p>s around
 254                 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 255                 # phrase emphasis, and spans. The list of tags we're looking for is
 256                 # hard-coded:
 257                 #
 258                 # *  List "a" is made of tags which can be both inline or block-level.
 259                 #    These will be treated block-level when the start tag is alone on
 260                 #    its line, otherwise they're not matched here and will be taken as
 261                 #    inline later.
 262                 # *  List "b" is made of tags which are always block-level;
 263                 #
 264                 $block_tags_a_re = 'ins|del';
 265                 $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
 266                                                    'script|noscript|style|form|fieldset|iframe|math|svg|'.
 267                                                    'article|section|nav|aside|hgroup|header|footer|'.
 268                                                    'figure';
 269
 270                 # Regular expression for the content of a block tag.
 271                 $nested_tags_level = 4;
 272                 $attr = '
 273                         (?>                             # optional tag attributes
 274                           \s                    # starts with whitespace
 275                           (?>
 276                                 [^>"/]+         # text outside quotes
 277                           |
 278                                 /+(?!>)         # slash not followed by ">"
 279                           |
 280                                 "[^"]*"         # text inside double quotes (tolerate ">")
 281                           |
 282                                 \'[^\']*\'      # text inside single quotes (tolerate ">")
 283                           )*
 284                         )?
 285                         ';
 286                 $content =
 287                         str_repeat('
 288                                 (?>
 289                                   [^<]+                 # content without tag
 290                                 |
 291                                   <\2                   # nested opening tag
 292                                         '.$attr.'       # attributes
 293                                         (?>
 294                                           />
 295                                         |
 296                                           >', $nested_tags_level).      # end of opening tag
 297                                           '.*?'.                                        # last level nested tag content
 298                         str_repeat('
 299                                           </\2\s*>      # closing nested tag
 300                                         )
 301                                   |
 302                                         <(?!/\2\s*>     # other tags with a different name
 303                                   )
 304                                 )*',
 305                                 $nested_tags_level);
 306                 $content2 = str_replace('\2', '\3', $content);
 307
 308                 # First, look for nested blocks, e.g.:
 309                 #       <div>
 310                 #               <div>
 311                 #               tags for inner block must be indented.
 312                 #               </div>
 313                 #       </div>
 314                 #
 315                 # The outermost tags must start at the left margin for this to match, and
 316                 # the inner nested divs must be indented.
 317                 # We need to do this before the next, more liberal match, because the next
 318                 # match will start at the first `<div>` and stop at the first `</div>`.
 319                 $text = preg_replace_callback('{(?>
 320                         (?>
 321                                 (?<=\n)                 # Starting on its own line
 322                                 |                               # or
 323                                 \A\n?                   # the at beginning of the doc
 324                         )
 325                         (                                               # save in $1
 326
 327                           # Match from `\n<tag>` to `</tag>\n`, handling nested tags
 328                           # in between.
 329
 330                                                 [ ]{0,'.$less_than_tab.'}
 331                                                 <('.$block_tags_b_re.')# start tag = $2
 332                                                 '.$attr.'>                      # attributes followed by > and \n
 333                                                 '.$content.'            # content, support nesting
 334                                                 </\2>                           # the matching end tag
 335                                                 [ ]*                            # trailing spaces/tabs
 336                                                 (?=\n+|\Z)      # followed by a newline or end of document
 337
 338                         | # Special version for tags of group a.
 339
 340                                                 [ ]{0,'.$less_than_tab.'}
 341                                                 <('.$block_tags_a_re.')# start tag = $3
 342                                                 '.$attr.'>[ ]*\n        # attributes followed by >
 343                                                 '.$content2.'           # content, support nesting
 344                                                 </\3>                           # the matching end tag
 345                                                 [ ]*                            # trailing spaces/tabs
 346                                                 (?=\n+|\Z)      # followed by a newline or end of document
 347
 348                         | # Special case just for <hr />. It was easier to make a special
 349                           # case than to make the other regex more complicated.
 350
 351                                                 [ ]{0,'.$less_than_tab.'}
 352                                                 <(hr)                           # start tag = $2
 353                                                 '.$attr.'                       # attributes
 354                                                 /?>                                     # the matching end tag
 355                                                 [ ]*
 356                                                 (?=\n{2,}|\Z)           # followed by a blank line or end of document
 357
 358                         | # Special case for standalone HTML comments:
 359
 360                                         [ ]{0,'.$less_than_tab.'}
 361                                         (?s:
 362                                                 <!-- .*? -->
 363                                         )
 364                                         [ ]*
 365                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 366
 367                         | # PHP and ASP-style processor instructions (<? and <%)
 368
 369                                         [ ]{0,'.$less_than_tab.'}
 370                                         (?s:
 371                                                 <([?%])                 # $2
 372                                                 .*?
 373                                                 \2>
 374                                         )
 375                                         [ ]*
 376                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 377
 378                         )
 379                         )}Sxmi',
 380                         array($this, '_hashHTMLBlocks_callback'),
 381                         $text);
 382
 383                 return $text;
 384         }
 385         protected function _hashHTMLBlocks_callback($matches) {
 386                 $text = $matches[1];
 387                 $key  = $this->hashBlock($text);
 388                 return "\n\n$key\n\n";
 389         }
 390
 391
 392         protected function hashPart($text, $boundary = 'X') {
 393         #
 394         # Called whenever a tag must be hashed when a function insert an atomic
 395         # element in the text stream. Passing $text to through this function gives
 396         # a unique text-token which will be reverted back when calling unhash.
 397         #
 398         # The $boundary argument specify what character should be used to surround
 399         # the token. By convension, "B" is used for block elements that needs not
 400         # to be wrapped into paragraph tags at the end, ":" is used for elements
 401         # that are word separators and "X" is used in the general case.
 402         #
 403                 # Swap back any tag hash found in $text so we do not have to `unhash`
 404                 # multiple times at the end.
 405                 $text = $this->unhash($text);
 406
 407                 # Then hash the block.
 408                 static $i = 0;
 409                 $key = "$boundary\x1A" . ++$i . $boundary;
 410                 $this->html_hashes[$key] = $text;
 411                 return $key; # String that will replace the tag.
 412         }
 413
 414
 415         protected function hashBlock($text) {
 416         #
 417         # Shortcut function for hashPart with block-level boundaries.
 418         #
 419                 return $this->hashPart($text, 'B');
 420         }
 421
 422
        # Method name => priority. runBasicBlockGamut() calls these in array
        # order; the constructor sorts the array by ascending priority.
        protected $block_gamut = array(
        #
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.
        #
                "doHeaders"         => 10,
                "doHorizontalRules" => 20,

                "doLists"           => 40,
                "doCodeBlocks"      => 50,
                "doBlockQuotes"     => 60,
                );
 435
 436         protected function runBlockGamut($text) {
 437         #
 438         # Run block gamut tranformations.
 439         #
 440                 # We need to escape raw HTML in Markdown source before doing anything
 441                 # else. This need to be done for each block, and not only at the
 442                 # begining in the Markdown function since hashed blocks can be part of
 443                 # list items and could have been indented. Indented blocks would have
 444                 # been seen as a code block in a previous pass of hashHTMLBlocks.
 445                 $text = $this->hashHTMLBlocks($text);
 446
 447                 return $this->runBasicBlockGamut($text);
 448         }
 449
 450         protected function runBasicBlockGamut($text) {
 451         #
 452         # Run block gamut tranformations, without hashing HTML blocks. This is
 453         # useful when HTML blocks are known to be already hashed, like in the first
 454         # whole-document pass.
 455         #
 456                 foreach ($this->block_gamut as $method => $priority) {
 457                         $text = $this->$method($text);
 458                 }
 459
 460                 # Finally form paragraph and restore hashed blocks.
 461                 $text = $this->formParagraphs($text);
 462
 463                 return $text;
 464         }
 465
 466
 467         protected function doHorizontalRules($text) {
 468                 # Do Horizontal Rules:
 469                 return preg_replace(
 470                         '{
 471                                 ^[ ]{0,3}       # Leading space
 472                                 ([-*_])         # $1: First marker
 473                                 (?>                     # Repeated marker group
 474                                         [ ]{0,2}        # Zero, one, or two spaces.
 475                                         \1                      # Marker character
 476                                 ){2,}           # Group repeated at least twice
 477                                 [ ]*            # Tailing spaces
 478                                 $                       # End of line.
 479                         }mx',
 480                         "\n".$this->hashBlock("<hr$this->empty_element_suffix")."\n",
 481                         $text);
 482         }
 483
 484
        # Method name => priority. runSpanGamut() calls these in array order;
        # the constructor sorts the array by ascending priority.
        protected $span_gamut = array(
        #
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.
        #
                # Process character escapes, code spans, and inline HTML
                # in one shot.
                "parseSpan"           => -30,

                # Process anchor and image tags. Images must come first,
                # because ![foo][f] looks like an anchor.
                "doImages"            =>  10,
                "doAnchors"           =>  20,

                # Make links out of things like `<https://example.com/>`
                # Must come after doAnchors, because you can use < and >
                # delimiters in inline links like [this](<url>).
                "doAutoLinks"         =>  30,
                "encodeAmpsAndAngles" =>  40,

                "doItalicsAndBold"    =>  50,
                "doHardBreaks"        =>  60,
                );
 508
 509         protected function runSpanGamut($text) {
 510         #
 511         # Run span gamut tranformations.
 512         #
 513                 foreach ($this->span_gamut as $method => $priority) {
 514                         $text = $this->$method($text);
 515                 }
 516
 517                 return $text;
 518         }
 519
 520
 521         protected function doHardBreaks($text) {
 522                 # Do hard breaks:
 523                 return preg_replace_callback('/ {2,}\n/',
 524                         array($this, '_doHardBreaks_callback'), $text);
 525         }
 526         protected function _doHardBreaks_callback($matches) {
 527                 return $this->hashPart("<br$this->empty_element_suffix\n");
 528         }
 529
 530
 531         protected function doAnchors($text) {
 532         #
 533         # Turn Markdown link shortcuts into XHTML <a> tags.
 534         #
 535                 if ($this->in_anchor) return $text;
 536                 $this->in_anchor = true;
 537
 538                 #
 539                 # First, handle reference-style links: [link text] [id]
 540                 #
 541                 $text = preg_replace_callback('{
 542                         (                                       # wrap whole match in $1
 543                           \[
 544                                 ('.$this->nested_brackets_re.') # link text = $2
 545                           \]
 546
 547                           [ ]?                          # one optional space
 548                           (?:\n[ ]*)?           # one optional newline followed by spaces
 549
 550                           \[
 551                                 (.*?)           # id = $3
 552                           \]
 553                         )
 554                         }xs',
 555                         array($this, '_doAnchors_reference_callback'), $text);
 556
 557                 #
 558                 # Next, inline-style links: [link text](url "optional title")
 559                 #
 560                 $text = preg_replace_callback('{
 561                         (                               # wrap whole match in $1
 562                           \[
 563                                 ('.$this->nested_brackets_re.') # link text = $2
 564                           \]
 565                           \(                    # literal paren
 566                                 [ \n]*
 567                                 (?:
 568                                         <(.+?)> # href = $3
 569                                 |
 570                                         ('.$this->nested_url_parenthesis_re.')  # href = $4
 571                                 )
 572                                 [ \n]*
 573                                 (                       # $5
 574                                   ([\'"])       # quote char = $6
 575                                   (.*?)         # Title = $7
 576                                   \6            # matching quote
 577                                   [ \n]*        # ignore any spaces/tabs between closing quote and )
 578                                 )?                      # title is optional
 579                           \)
 580                         )
 581                         }xs',
 582                         array($this, '_doAnchors_inline_callback'), $text);
 583
 584                 #
 585                 # Last, handle reference-style shortcuts: [link text]
 586                 # These must come last in case you've also got [link text][1]
 587                 # or [link text](/foo)
 588                 #
 589                 $text = preg_replace_callback('{
 590                         (                                       # wrap whole match in $1
 591                           \[
 592                                 ([^\[\]]+)              # link text = $2; can\'t contain [ or ]
 593                           \]
 594                         )
 595                         }xs',
 596                         array($this, '_doAnchors_reference_callback'), $text);
 597
 598                 $this->in_anchor = false;
 599                 return $text;
 600         }
 601         protected function _doAnchors_reference_callback($matches) {
 602                 $whole_match =  $matches[1];
 603                 $link_text   =  $matches[2];
 604                 $link_id     =& $matches[3];
 605
 606                 if ($link_id == "") {
 607                         # for shortcut links like [this][] or [this].
 608                         $link_id = $link_text;
 609                 }
 610
 611                 # lower-case and turn embedded newlines into spaces
 612                 $link_id = strtolower($link_id);
 613                 $link_id = preg_replace('{[ ]?\n}', ' ', $link_id);
 614
 615                 if (isset($this->urls[$link_id])) {
 616                         $url = $this->urls[$link_id];
 617                         $url = $this->encodeURLAttribute($url);
 618
 619                         $result = "<a href=\"$url\"";
 620                         if ( isset( $this->titles[$link_id] ) ) {
 621                                 $title = $this->titles[$link_id];
 622                                 $title = $this->encodeAttribute($title);
 623                                 $result .=  " title=\"$title\"";
 624                         }
 625
 626                         $link_text = $this->runSpanGamut($link_text);
 627                         $result .= ">$link_text</a>";
 628                         $result = $this->hashPart($result);
 629                 }
 630                 else {
 631                         $result = $whole_match;
 632                 }
 633                 return $result;
 634         }
 635         protected function _doAnchors_inline_callback($matches) {
 636                 $whole_match    =  $matches[1];
 637                 $link_text              =  $this->runSpanGamut($matches[2]);
 638                 $url                    =  $matches[3] == '' ? $matches[4] : $matches[3];
 639                 $title                  =& $matches[7];
 640
 641                 // if the URL was of the form <s p a c e s> it got caught by the HTML
 642                 // tag parser and hashed. Need to reverse the process before using the URL.
 643                 $unhashed = $this->unhash($url);
 644                 if ($unhashed != $url)
 645                         $url = preg_replace('/^<(.*)>$/', '\1', $unhashed);
 646
 647                 $url = $this->encodeURLAttribute($url);
 648
 649                 $result = "<a href=\"$url\"";
 650                 if (isset($title)) {
 651                         $title = $this->encodeAttribute($title);
 652                         $result .=  " title=\"$title\"";
 653                 }
 654
 655                 $link_text = $this->runSpanGamut($link_text);
 656                 $result .= ">$link_text</a>";
 657
 658                 return $this->hashPart($result);
 659         }
 660
 661
 662         protected function doImages($text) {
 663         #
 664         # Turn Markdown image shortcuts into <img> tags.
 665         #
 666                 #
 667                 # First, handle reference-style labeled images: ![alt text][id]
 668                 #
 669                 $text = preg_replace_callback('{
 670                         (                               # wrap whole match in $1
 671                           !\[
 672                                 ('.$this->nested_brackets_re.')         # alt text = $2
 673                           \]
 674
 675                           [ ]?                          # one optional space
 676                           (?:\n[ ]*)?           # one optional newline followed by spaces
 677
 678                           \[
 679                                 (.*?)           # id = $3
 680                           \]
 681
 682                         )
 683                         }xs',
 684                         array($this, '_doImages_reference_callback'), $text);
 685
 686                 #
 687                 # Next, handle inline images:  ![alt text](url "optional title")
 688                 # Don't forget: encode * and _
 689                 #
 690                 $text = preg_replace_callback('{
 691                         (                               # wrap whole match in $1
 692                           !\[
 693                                 ('.$this->nested_brackets_re.')         # alt text = $2
 694                           \]
 695                           \s?                   # One optional whitespace character
 696                           \(                    # literal paren
 697                                 [ \n]*
 698                                 (?:
 699                                         <(\S*)> # src url = $3
 700                                 |
 701                                         ('.$this->nested_url_parenthesis_re.')  # src url = $4
 702                                 )
 703                                 [ \n]*
 704                                 (                       # $5
 705                                   ([\'"])       # quote char = $6
 706                                   (.*?)         # title = $7
 707                                   \6            # matching quote
 708                                   [ \n]*
 709                                 )?                      # title is optional
 710                           \)
 711                         )
 712                         }xs',
 713                         array($this, '_doImages_inline_callback'), $text);
 714
 715                 return $text;
 716         }
 717         protected function _doImages_reference_callback($matches) {
 718                 $whole_match = $matches[1];
 719                 $alt_text    = $matches[2];
 720                 $link_id     = strtolower($matches[3]);
 721
 722                 if ($link_id == "") {
 723                         $link_id = strtolower($alt_text); # for shortcut links like ![this][].
 724                 }
 725
 726                 $alt_text = $this->encodeAttribute($alt_text);
 727                 if (isset($this->urls[$link_id])) {
 728                         $url = $this->encodeURLAttribute($this->urls[$link_id]);
 729                         $result = "<img src=\"$url\" alt=\"$alt_text\"";
 730                         if (isset($this->titles[$link_id])) {
 731                                 $title = $this->titles[$link_id];
 732                                 $title = $this->encodeAttribute($title);
 733                                 $result .=  " title=\"$title\"";
 734                         }
 735                         $result .= $this->empty_element_suffix;
 736                         $result = $this->hashPart($result);
 737                 }
 738                 else {
 739                         # If there's no such link ID, leave intact:
 740                         $result = $whole_match;
 741                 }
 742
 743                 return $result;
 744         }
 745         protected function _doImages_inline_callback($matches) {
 746                 $whole_match    = $matches[1];
 747                 $alt_text               = $matches[2];
 748                 $url                    = $matches[3] == '' ? $matches[4] : $matches[3];
 749                 $title                  =& $matches[7];
 750
 751                 $alt_text = $this->encodeAttribute($alt_text);
 752                 $url = $this->encodeURLAttribute($url);
 753                 $result = "<img src=\"$url\" alt=\"$alt_text\"";
 754                 if (isset($title)) {
 755                         $title = $this->encodeAttribute($title);
 756                         $result .=  " title=\"$title\""; # $title already quoted
 757                 }
 758                 $result .= $this->empty_element_suffix;
 759
 760                 return $this->hashPart($result);
 761         }
 762
 763
 764         protected function doHeaders($text) {
 765                 # Setext-style headers:
 766                 #         Header 1
 767                 #         ========
 768                 #
 769                 #         Header 2
 770                 #         --------
 771                 #
 772                 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
 773                         array($this, '_doHeaders_callback_setext'), $text);
 774
 775                 # atx-style headers:
 776                 #       # Header 1
 777                 #       ## Header 2
 778                 #       ## Header 2 with closing hashes ##
 779                 #       ...
 780                 #       ###### Header 6
 781                 #
 782                 $text = preg_replace_callback('{
 783                                 ^(\#{1,6})      # $1 = string of #\'s
 784                                 [ ]*
 785                                 (.+?)           # $2 = Header text
 786                                 [ ]*
 787                                 \#*                     # optional closing #\'s (not counted)
 788                                 \n+
 789                         }xm',
 790                         array($this, '_doHeaders_callback_atx'), $text);
 791
 792                 return $text;
 793         }
 794
 795         protected function _doHeaders_callback_setext($matches) {
 796                 # Terrible hack to check we haven't found an empty list item.
 797                 if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1]))
 798                         return $matches[0];
 799
 800                 $level = $matches[2]{0} == '=' ? 1 : 2;
 801
 802                 # id attribute generation
 803                 $idAtt = $this->_generateIdFromHeaderValue($matches[1]);
 804
 805                 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[1])."</h$level>";
 806                 return "\n" . $this->hashBlock($block) . "\n\n";
 807         }
 808         protected function _doHeaders_callback_atx($matches) {
 809
 810                 # id attribute generation
 811                 $idAtt = $this->_generateIdFromHeaderValue($matches[2]);
 812
 813                 $level = strlen($matches[1]);
 814                 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[2])."</h$level>";
 815                 return "\n" . $this->hashBlock($block) . "\n\n";
 816         }
 817
 818         protected function _generateIdFromHeaderValue($headerValue) {
 819
 820                 # if a header_id_func property is set, we can use it to automatically
 821                 # generate an id attribute.
 822                 #
 823                 # This method returns a string in the form id="foo", or an empty string
 824                 # otherwise.
 825                 if (!is_callable($this->header_id_func)) {
 826                         return "";
 827                 }
 828                 $idValue = call_user_func($this->header_id_func, $headerValue);
 829                 if (!$idValue) return "";
 830
 831                 return ' id="' . $this->encodeAttribute($idValue) . '"';
 832
 833         }
 834
 835         protected function doLists($text) {
 836         #
 837         # Form HTML ordered (numbered) and unordered (bulleted) lists.
 838         #
 839                 $less_than_tab = $this->tab_width - 1;
 840
 841                 # Re-usable patterns to match list item bullets and number markers:
 842                 $marker_ul_re  = '[*+-]';
 843                 $marker_ol_re  = '\d+[\.]';
 844
 845                 $markers_relist = array(
 846                         $marker_ul_re => $marker_ol_re,
 847                         $marker_ol_re => $marker_ul_re,
 848                         );
 849
 850                 foreach ($markers_relist as $marker_re => $other_marker_re) {
 851                         # Re-usable pattern to match any entirel ul or ol list:
 852                         $whole_list_re = '
 853                                 (                                                               # $1 = whole list
 854                                   (                                                             # $2
 855                                         ([ ]{0,'.$less_than_tab.'})     # $3 = number of spaces
 856                                         ('.$marker_re.')                        # $4 = first list item marker
 857                                         [ ]+
 858                                   )
 859                                   (?s:.+?)
 860                                   (                                                             # $5
 861                                           \z
 862                                         |
 863                                           \n{2,}
 864                                           (?=\S)
 865                                           (?!                                           # Negative lookahead for another list item marker
 866                                                 [ ]*
 867                                                 '.$marker_re.'[ ]+
 868                                           )
 869                                         |
 870                                           (?=                                           # Lookahead for another kind of list
 871                                             \n
 872                                                 \3                                              # Must have the same indentation
 873                                                 '.$other_marker_re.'[ ]+
 874                                           )
 875                                   )
 876                                 )
 877                         '; // mx
 878
 879                         # We use a different prefix before nested lists than top-level lists.
 880                         # See extended comment in _ProcessListItems().
 881
 882                         if ($this->list_level) {
 883                                 $text = preg_replace_callback('{
 884                                                 ^
 885                                                 '.$whole_list_re.'
 886                                         }mx',
 887                                         array($this, '_doLists_callback'), $text);
 888                         }
 889                         else {
 890                                 $text = preg_replace_callback('{
 891                                                 (?:(?<=\n)\n|\A\n?) # Must eat the newline
 892                                                 '.$whole_list_re.'
 893                                         }mx',
 894                                         array($this, '_doLists_callback'), $text);
 895                         }
 896                 }
 897
 898                 return $text;
 899         }
 900         protected function _doLists_callback($matches) {
 901                 # Re-usable patterns to match list item bullets and number markers:
 902                 $marker_ul_re  = '[*+-]';
 903                 $marker_ol_re  = '\d+[\.]';
 904                 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
 905                 $marker_ol_start_re = '[0-9]+';
 906
 907                 $list = $matches[1];
 908                 $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol";
 909
 910                 $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re );
 911
 912                 $list .= "\n";
 913                 $result = $this->processListItems($list, $marker_any_re);
 914
 915                 $ol_start = 1;
 916                 if ($this->enhanced_ordered_list) {
 917                         # Get the start number for ordered list.
 918                         if ($list_type == 'ol') {
 919                                 $ol_start_array = array();
 920                                 $ol_start_check = preg_match("/$marker_ol_start_re/", $matches[4], $ol_start_array);
 921                                 if ($ol_start_check){
 922                                         $ol_start = $ol_start_array[0];
 923                                 }
 924                         }
 925                 }
 926
 927                 if ($ol_start > 1 && $list_type == 'ol'){
 928                         $result = $this->hashBlock("<$list_type start=\"$ol_start\">\n" . $result . "</$list_type>");
 929                 } else {
 930                         $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>");
 931                 }
 932                 return "\n". $result ."\n\n";
 933         }
 934
        # Current list nesting depth: processListItems() increments it on entry
        # and decrements it on exit, and doLists() picks a different pattern
        # prefix when it is non-zero. Zero means we are not inside a list.
        protected $list_level = 0;
 936
 937         protected function processListItems($list_str, $marker_any_re) {
 938         #
 939         #       Process the contents of a single ordered or unordered list, splitting it
 940         #       into individual list items.
 941         #
 942                 # The $this->list_level global keeps track of when we're inside a list.
 943                 # Each time we enter a list, we increment it; when we leave a list,
 944                 # we decrement. If it's zero, we're not in a list anymore.
 945                 #
 946                 # We do this because when we're not inside a list, we want to treat
 947                 # something like this:
 948                 #
 949                 #               I recommend upgrading to version
 950                 #               8. Oops, now this line is treated
 951                 #               as a sub-list.
 952                 #
 953                 # As a single paragraph, despite the fact that the second line starts
 954                 # with a digit-period-space sequence.
 955                 #
 956                 # Whereas when we're inside a list (or sub-list), that line will be
 957                 # treated as the start of a sub-list. What a kludge, huh? This is
 958                 # an aspect of Markdown's syntax that's hard to parse perfectly
 959                 # without resorting to mind-reading. Perhaps the solution is to
 960                 # change the syntax rules such that sub-lists must start with a
 961                 # starting cardinal number; e.g. "1." or "a.".
 962
 963                 $this->list_level++;
 964
 965                 # trim trailing blank lines:
 966                 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
 967
 968                 $list_str = preg_replace_callback('{
 969                         (\n)?                                                   # leading line = $1
 970                         (^[ ]*)                                                 # leading whitespace = $2
 971                         ('.$marker_any_re.'                             # list marker and space = $3
 972                                 (?:[ ]+|(?=\n)) # space only required if item is not empty
 973                         )
 974                         ((?s:.*?))                                              # list item text   = $4
 975                         (?:(\n+(?=\n))|\n)                              # tailing blank line = $5
 976                         (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
 977                         }xm',
 978                         array($this, '_processListItems_callback'), $list_str);
 979
 980                 $this->list_level--;
 981                 return $list_str;
 982         }
 983         protected function _processListItems_callback($matches) {
 984                 $item = $matches[4];
 985                 $leading_line =& $matches[1];
 986                 $leading_space =& $matches[2];
 987                 $marker_space = $matches[3];
 988                 $tailing_blank_line =& $matches[5];
 989
 990                 if ($leading_line || $tailing_blank_line ||
 991                         preg_match('/\n{2,}/', $item))
 992                 {
 993                         # Replace marker with the appropriate whitespace indentation
 994                         $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item;
 995                         $item = $this->runBlockGamut($this->outdent($item)."\n");
 996                 }
 997                 else {
 998                         # Recursion for sub-lists:
 999                         $item = $this->doLists($this->outdent($item));
1000                         $item = preg_replace('/\n+$/', '', $item);
1001                         $item = $this->runSpanGamut($item);
1002                 }
1003
1004                 return "<li>" . $item . "</li>\n";
1005         }
1006
1007
1008         protected function doCodeBlocks($text) {
1009         #
1010         #       Process Markdown `<pre><code>` blocks.
1011         #
1012                 $text = preg_replace_callback('{
1013                                 (?:\n\n|\A\n?)
1014                                 (                   # $1 = the code block -- one or more lines, starting with a space/tab
1015                                   (?>
1016                                         [ ]{'.$this->tab_width.'}  # Lines must start with a tab or a tab-width of spaces
1017                                         .*\n+
1018                                   )+
1019                                 )
1020                                 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1021                         }xm',
1022                         array($this, '_doCodeBlocks_callback'), $text);
1023
1024                 return $text;
1025         }
1026         protected function _doCodeBlocks_callback($matches) {
1027                 $codeblock = $matches[1];
1028
1029                 $codeblock = $this->outdent($codeblock);
1030                 if ($this->code_block_content_func) {
1031                         $codeblock = call_user_func($this->code_block_content_func, $codeblock, "");
1032                 } else {
1033                         $codeblock = htmlspecialchars($codeblock, ENT_NOQUOTES);
1034                 }
1035
1036                 # trim leading newlines and trailing newlines
1037                 $codeblock = preg_replace('/\A\n+|\n+\z/', '', $codeblock);
1038
1039                 $codeblock = "<pre><code>$codeblock\n</code></pre>";
1040                 return "\n\n".$this->hashBlock($codeblock)."\n\n";
1041         }
1042
1043
1044         protected function makeCodeSpan($code) {
1045         #
1046         # Create a code span markup for $code. Called from handleSpanToken.
1047         #
1048                 $code = htmlspecialchars(trim($code), ENT_NOQUOTES);
1049                 return $this->hashPart("<code>$code</code>");
1050         }
1051
1052
        # Emphasis token patterns, keyed by the marker that is currently open.
        # The '' key is used while no marker of that kind is open and matches an
        # opening marker (one not followed by whitespace). The other keys match
        # the closing marker for that open marker (one not preceded by
        # whitespace). prepareItalicsAndBold() combines these tables.

        # Single-character emphasis (<em>): * or _
        protected $em_relist = array(
                ''  => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?![\.,:;]?\s)',
                '*' => '(?<![\s*])\*(?!\*)',
                '_' => '(?<![\s_])_(?!_)',
                );
        # Double-character emphasis (<strong>): ** or __
        protected $strong_relist = array(
                ''   => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?![\.,:;]?\s)',
                '**' => '(?<![\s*])\*\*(?!\*)',
                '__' => '(?<![\s_])__(?!_)',
                );
        # Triple-character emphasis (<strong><em>): *** or ___
        protected $em_strong_relist = array(
                ''    => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?![\.,:;]?\s)',
                '***' => '(?<![\s*])\*\*\*(?!\*)',
                '___' => '(?<![\s_])___(?!_)',
                );
        # Combined token expressions keyed by "$em$strong"; filled in by
        # prepareItalicsAndBold() and read by doItalicsAndBold().
        protected $em_strong_prepared_relist;
1069
1070         protected function prepareItalicsAndBold() {
1071         #
1072         # Prepare regular expressions for searching emphasis tokens in any
1073         # context.
1074         #
1075                 foreach ($this->em_relist as $em => $em_re) {
1076                         foreach ($this->strong_relist as $strong => $strong_re) {
1077                                 # Construct list of allowed token expressions.
1078                                 $token_relist = array();
1079                                 if (isset($this->em_strong_relist["$em$strong"])) {
1080                                         $token_relist[] = $this->em_strong_relist["$em$strong"];
1081                                 }
1082                                 $token_relist[] = $em_re;
1083                                 $token_relist[] = $strong_re;
1084
1085                                 # Construct master expression from list.
1086                                 $token_re = '{('. implode('|', $token_relist) .')}';
1087                                 $this->em_strong_prepared_relist["$em$strong"] = $token_re;
1088                         }
1089                 }
1090         }
1091
1092         protected function doItalicsAndBold($text) {
1093                 $token_stack = array('');
1094                 $text_stack = array('');
1095                 $em = '';
1096                 $strong = '';
1097                 $tree_char_em = false;
1098
1099                 while (1) {
1100                         #
1101                         # Get prepared regular expression for seraching emphasis tokens
1102                         # in current context.
1103                         #
1104                         $token_re = $this->em_strong_prepared_relist["$em$strong"];
1105
1106                         #
1107                         # Each loop iteration search for the next emphasis token.
1108                         # Each token is then passed to handleSpanToken.
1109                         #
1110                         $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
1111                         $text_stack[0] .= $parts[0];
1112                         $token =& $parts[1];
1113                         $text =& $parts[2];
1114
1115                         if (empty($token)) {
1116                                 # Reached end of text span: empty stack without emitting.
1117                                 # any more emphasis.
1118                                 while ($token_stack[0]) {
1119                                         $text_stack[1] .= array_shift($token_stack);
1120                                         $text_stack[0] .= array_shift($text_stack);
1121                                 }
1122                                 break;
1123                         }
1124
1125                         $token_len = strlen($token);
1126                         if ($tree_char_em) {
1127                                 # Reached closing marker while inside a three-char emphasis.
1128                                 if ($token_len == 3) {
1129                                         # Three-char closing marker, close em and strong.
1130                                         array_shift($token_stack);
1131                                         $span = array_shift($text_stack);
1132                                         $span = $this->runSpanGamut($span);
1133                                         $span = "<strong><em>$span</em></strong>";
1134                                         $text_stack[0] .= $this->hashPart($span);
1135                                         $em = '';
1136                                         $strong = '';
1137                                 } else {
1138                                         # Other closing marker: close one em or strong and
1139                                         # change current token state to match the other
1140                                         $token_stack[0] = str_repeat($token{0}, 3-$token_len);
1141                                         $tag = $token_len == 2 ? "strong" : "em";
1142                                         $span = $text_stack[0];
1143                                         $span = $this->runSpanGamut($span);
1144                                         $span = "<$tag>$span</$tag>";
1145                                         $text_stack[0] = $this->hashPart($span);
1146                                         $$tag = ''; # $$tag stands for $em or $strong
1147                                 }
1148                                 $tree_char_em = false;
1149                         } else if ($token_len == 3) {
1150                                 if ($em) {
1151                                         # Reached closing marker for both em and strong.
1152                                         # Closing strong marker:
1153                                         for ($i = 0; $i < 2; ++$i) {
1154                                                 $shifted_token = array_shift($token_stack);
1155                                                 $tag = strlen($shifted_token) == 2 ? "strong" : "em";
1156                                                 $span = array_shift($text_stack);
1157                                                 $span = $this->runSpanGamut($span);
1158                                                 $span = "<$tag>$span</$tag>";
1159                                                 $text_stack[0] .= $this->hashPart($span);
1160                                                 $$tag = ''; # $$tag stands for $em or $strong
1161                                         }
1162                                 } else {
1163                                         # Reached opening three-char emphasis marker. Push on token
1164                                         # stack; will be handled by the special condition above.
1165                                         $em = $token{0};
1166                                         $strong = "$em$em";
1167                                         array_unshift($token_stack, $token);
1168                                         array_unshift($text_stack, '');
1169                                         $tree_char_em = true;
1170                                 }
1171                         } else if ($token_len == 2) {
1172                                 if ($strong) {
1173                                         # Unwind any dangling emphasis marker:
1174                                         if (strlen($token_stack[0]) == 1) {
1175                                                 $text_stack[1] .= array_shift($token_stack);
1176                                                 $text_stack[0] .= array_shift($text_stack);
1177                                         }
1178                                         # Closing strong marker:
1179                                         array_shift($token_stack);
1180                                         $span = array_shift($text_stack);
1181                                         $span = $this->runSpanGamut($span);
1182                                         $span = "<strong>$span</strong>";
1183                                         $text_stack[0] .= $this->hashPart($span);
1184                                         $strong = '';
1185                                 } else {
1186                                         array_unshift($token_stack, $token);
1187                                         array_unshift($text_stack, '');
1188                                         $strong = $token;
1189                                 }
1190                         } else {
1191                                 # Here $token_len == 1
1192                                 if ($em) {
1193                                         if (strlen($token_stack[0]) == 1) {
1194                                                 # Closing emphasis marker:
1195                                                 array_shift($token_stack);
1196                                                 $span = array_shift($text_stack);
1197                                                 $span = $this->runSpanGamut($span);
1198                                                 $span = "<em>$span</em>";
1199                                                 $text_stack[0] .= $this->hashPart($span);
1200                                                 $em = '';
1201                                         } else {
1202                                                 $text_stack[0] .= $token;
1203                                         }
1204                                 } else {
1205                                         array_unshift($token_stack, $token);
1206                                         array_unshift($text_stack, '');
1207                                         $em = $token;
1208                                 }
1209                         }
1210                 }
1211                 return $text_stack[0];
1212         }
1213
1214
1215         protected function doBlockQuotes($text) {
1216                 $text = preg_replace_callback('/
1217                           (                                                             # Wrap whole match in $1
1218                                 (?>
1219                                   ^[ ]*>[ ]?                    # ">" at the start of a line
1220                                         .+\n                                    # rest of the first line
1221                                   (.+\n)*                                       # subsequent consecutive lines
1222                                   \n*                                           # blanks
1223                                 )+
1224                           )
1225                         /xm',
1226                         array($this, '_doBlockQuotes_callback'), $text);
1227
1228                 return $text;
1229         }
1230         protected function _doBlockQuotes_callback($matches) {
1231                 $bq = $matches[1];
1232                 # trim one level of quoting - trim whitespace-only lines
1233                 $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
1234                 $bq = $this->runBlockGamut($bq);                # recurse
1235
1236                 $bq = preg_replace('/^/m', "  ", $bq);
1237                 # These leading spaces cause problem with <pre> content,
1238                 # so we need to fix that:
1239                 $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
1240                         array($this, '_doBlockQuotes_callback2'), $bq);
1241
1242                 return "\n". $this->hashBlock("<blockquote>\n$bq\n</blockquote>")."\n\n";
1243         }
1244         protected function _doBlockQuotes_callback2($matches) {
1245                 $pre = $matches[1];
1246                 $pre = preg_replace('/^  /m', '', $pre);
1247                 return $pre;
1248         }
1249
1250
1251         protected function formParagraphs($text) {
1252         #
1253         #       Params:
1254         #               $text - string to process with html <p> tags
1255         #
1256                 # Strip leading and trailing lines:
1257                 $text = preg_replace('/\A\n+|\n+\z/', '', $text);
1258
1259                 $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
1260
1261                 #
1262                 # Wrap <p> tags and unhashify HTML blocks
1263                 #
1264                 foreach ($grafs as $key => $value) {
1265                         if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
1266                                 # Is a paragraph.
1267                                 $value = $this->runSpanGamut($value);
1268                                 $value = preg_replace('/^([ ]*)/', "<p>", $value);
1269                                 $value .= "</p>";
1270                                 $grafs[$key] = $this->unhash($value);
1271                         }
1272                         else {
1273                                 # Is a block.
1274                                 # Modify elements of @grafs in-place...
1275                                 $graf = $value;
1276                                 $block = $this->html_hashes[$graf];
1277                                 $graf = $block;
1278 //                              if (preg_match('{
1279 //                                      \A
1280 //                                      (                                                       # $1 = <div> tag
1281 //                                        <div  \s+
1282 //                                        [^>]*
1283 //                                        \b
1284 //                                        markdown\s*=\s*  ([\'"])      #       $2 = attr quote char
1285 //                                        1
1286 //                                        \2
1287 //                                        [^>]*
1288 //                                        >
1289 //                                      )
1290 //                                      (                                                       # $3 = contents
1291 //                                      .*
1292 //                                      )
1293 //                                      (</div>)                                        # $4 = closing tag
1294 //                                      \z
1295 //                                      }xs', $block, $matches))
1296 //                              {
1297 //                                      list(, $div_open, , $div_content, $div_close) = $matches;
1298 //
1299 //                                      # We can't call Markdown(), because that resets the hash;
1300 //                                      # that initialization code should be pulled into its own sub, though.
1301 //                                      $div_content = $this->hashHTMLBlocks($div_content);
1302 //
1303 //                                      # Run document gamut methods on the content.
1304 //                                      foreach ($this->document_gamut as $method => $priority) {
1305 //                                              $div_content = $this->$method($div_content);
1306 //                                      }
1307 //
1308 //                                      $div_open = preg_replace(
1309 //                                              '{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open);
1310 //
1311 //                                      $graf = $div_open . "\n" . $div_content . "\n" . $div_close;
1312 //                              }
1313                                 $grafs[$key] = $graf;
1314                         }
1315                 }
1316
1317                 return implode("\n\n", $grafs);
1318         }
1319
1320
1321         protected function encodeAttribute($text) {
1322         #
1323         # Encode text for a double-quoted HTML attribute. This function
1324         # is *not* suitable for attributes enclosed in single quotes.
1325         #
1326                 $text = $this->encodeAmpsAndAngles($text);
1327                 $text = str_replace('"', '&quot;', $text);
1328                 return $text;
1329         }
1330
1331
1332         protected function encodeURLAttribute($url, &$text = null) {
1333         #
1334         # Encode text for a double-quoted HTML attribute containing a URL,
1335         # applying the URL filter if set. Also generates the textual
1336         # representation for the URL (removing mailto: or tel:) storing it in $text.
1337         # This function is *not* suitable for attributes enclosed in single quotes.
1338         #
1339                 if ($this->url_filter_func)
1340                         $url = call_user_func($this->url_filter_func, $url);
1341
1342                 if (preg_match('{^mailto:}i', $url))
1343                         $url = $this->encodeEntityObfuscatedAttribute($url, $text, 7);
1344                 else if (preg_match('{^tel:}i', $url))
1345                 {
1346                         $url = $this->encodeAttribute($url);
1347                         $text = substr($url, 4);
1348                 }
1349                 else
1350                 {
1351                         $url = $this->encodeAttribute($url);
1352                         $text = $url;
1353                 }
1354
1355                 return $url;
1356         }
1357
1358
1359         protected function encodeAmpsAndAngles($text) {
1360         #
1361         # Smart processing for ampersands and angle brackets that need to
1362         # be encoded. Valid character entities are left alone unless the
1363         # no-entities mode is set.
1364         #
1365                 if ($this->no_entities) {
1366                         $text = str_replace('&', '&amp;', $text);
1367                 } else {
1368                         # Ampersand-encoding based entirely on Nat Irons's Amputator
1369                         # MT plugin: <http://bumppo.net/projects/amputator/>
1370                         $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
1371                                                                 '&amp;', $text);
1372                 }
1373                 # Encode remaining <'s
1374                 $text = str_replace('<', '&lt;', $text);
1375
1376                 return $text;
1377         }
1378
1379
1380         protected function doAutoLinks($text) {
1381                 $text = preg_replace_callback('{<((https?|ftp|dict|tel):[^\'">\s]+)>}i',
1382                         array($this, '_doAutoLinks_url_callback'), $text);
1383
1384                 # Email addresses: <address@domain.foo>
1385                 $text = preg_replace_callback('{
1386                         <
1387                         (?:mailto:)?
1388                         (
1389                                 (?:
1390                                         [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
1391                                 |
1392                                         ".*?"
1393                                 )
1394                                 \@
1395                                 (?:
1396                                         [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
1397                                 |
1398                                         \[[\d.a-fA-F:]+\]       # IPv4 & IPv6
1399                                 )
1400                         )
1401                         >
1402                         }xi',
1403                         array($this, '_doAutoLinks_email_callback'), $text);
1404
1405                 return $text;
1406         }
1407         protected function _doAutoLinks_url_callback($matches) {
1408                 $url = $this->encodeURLAttribute($matches[1], $text);
1409                 $link = "<a href=\"$url\">$text</a>";
1410                 return $this->hashPart($link);
1411         }
1412         protected function _doAutoLinks_email_callback($matches) {
1413                 $addr = $matches[1];
1414                 $url = $this->encodeURLAttribute("mailto:$addr", $text);
1415                 $link = "<a href=\"$url\">$text</a>";
1416                 return $this->hashPart($link);
1417         }
1418
1419
1420         protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) {
1421         #
1422         #       Input: some text to obfuscate, e.g. "mailto:foo@example.com"
1423         #
1424         #       Output: the same text but with most characters encoded as either a
1425         #               decimal or hex entity, in the hopes of foiling most address
1426         #               harvesting spam bots. E.g.:
1427         #
1428         #        &#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
1429         #        &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
1430         #        &#x6d;
1431         #
1432         #       Note: the additional output $tail is assigned the same value as the
1433         #       ouput, minus the number of characters specified by $head_length.
1434         #
1435         #       Based by a filter by Matthew Wickline, posted to BBEdit-Talk.
1436         #   With some optimizations by Milian Wolff. Forced encoding of HTML
1437         #       attribute special characters by Allan Odgaard.
1438         #
1439                 if ($text == "") return $tail = "";
1440
1441                 $chars = preg_split('/(?<!^)(?!$)/', $text);
1442                 $seed = (int)abs(crc32($text) / strlen($text)); # Deterministic seed.
1443
1444                 foreach ($chars as $key => $char) {
1445                         $ord = ord($char);
1446                         # Ignore non-ascii chars.
1447                         if ($ord < 128) {
1448                                 $r = ($seed * (1 + $key)) % 100; # Pseudo-random function.
1449                                 # roughly 10% raw, 45% hex, 45% dec
1450                                 # '@' *must* be encoded. I insist.
1451                                 # '"' and '>' have to be encoded inside the attribute
1452                                 if ($r > 90 && strpos('@"&>', $char) === false) /* do nothing */;
1453                                 else if ($r < 45) $chars[$key] = '&#x'.dechex($ord).';';
1454                                 else              $chars[$key] = '&#'.$ord.';';
1455                         }
1456                 }
1457
1458                 $text = implode('', $chars);
1459                 $tail = $head_length ? implode('', array_slice($chars, $head_length)) : $text;
1460
1461                 return $text;
1462         }
1463
1464
1465         protected function parseSpan($str) {
1466         #
1467         # Take the string $str and parse it into tokens, hashing embeded HTML,
1468         # escaped characters and handling code spans.
1469         #
1470                 $output = '';
1471
1472                 $span_re = '{
1473                                 (
1474                                         \\\\'.$this->escape_chars_re.'
1475                                 |
1476                                         (?<![`\\\\])
1477                                         `+                                              # code span marker
1478                         '.( $this->no_markup ? '' : '
1479                                 |
1480                                         <!--    .*?     -->             # comment
1481                                 |
1482                                         <\?.*?\?> | <%.*?%>             # processing instruction
1483                                 |
1484                                         <[!$]?[-a-zA-Z0-9:_]+   # regular tags
1485                                         (?>
1486                                                 \s
1487                                                 (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
1488                                         )?
1489                                         >
1490                                 |
1491                                         <[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag
1492                                 |
1493                                         </[-a-zA-Z0-9:_]+\s*> # closing tag
1494                         ').'
1495                                 )
1496                                 }xs';
1497
1498                 while (1) {
1499                         #
1500                         # Each loop iteration seach for either the next tag, the next
1501                         # openning code span marker, or the next escaped character.
1502                         # Each token is then passed to handleSpanToken.
1503                         #
1504                         $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);
1505
1506                         # Create token from text preceding tag.
1507                         if ($parts[0] != "") {
1508                                 $output .= $parts[0];
1509                         }
1510
1511                         # Check if we reach the end.
1512                         if (isset($parts[1])) {
1513                                 $output .= $this->handleSpanToken($parts[1], $parts[2]);
1514                                 $str = $parts[2];
1515                         }
1516                         else {
1517                                 break;
1518                         }
1519                 }
1520
1521                 return $output;
1522         }
1523
1524
1525         protected function handleSpanToken($token, &$str) {
1526         #
1527         # Handle $token provided by parseSpan by determining its nature and
1528         # returning the corresponding value that should replace it.
1529         #
1530                 switch ($token{0}) {
1531                         case "\\":
1532                                 return $this->hashPart("&#". ord($token{1}). ";");
1533                         case "`":
1534                                 # Search for end marker in remaining text.
1535                                 if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
1536                                         $str, $matches))
1537                                 {
1538                                         $str = $matches[2];
1539                                         $codespan = $this->makeCodeSpan($matches[1]);
1540                                         return $this->hashPart($codespan);
1541                                 }
1542                                 return $token; // return as text since no ending marker found.
1543                         default:
1544                                 return $this->hashPart($token);
1545                 }
1546         }
1547
1548
1549         protected function outdent($text) {
1550         #
1551         # Remove one level of line-leading tabs or spaces
1552         #
1553                 return preg_replace('/^(\t|[ ]{1,'.$this->tab_width.'})/m', '', $text);
1554         }
1555
1556
        # String length function for detab. `_initDetab` will create a function to
        # handle UTF-8 if the default function does not exist.
1559         protected $utf8_strlen = 'mb_strlen';
1560
1561         protected function detab($text) {
1562         #
1563         # Replace tabs with the appropriate amount of space.
1564         #
1565                 # For each line we separate the line in blocks delemited by
1566                 # tab characters. Then we reconstruct every line by adding the
1567                 # appropriate number of space between each blocks.
1568
1569                 $text = preg_replace_callback('/^.*\t.*$/m',
1570                         array($this, '_detab_callback'), $text);
1571
1572                 return $text;
1573         }
1574         protected function _detab_callback($matches) {
1575                 $line = $matches[0];
1576                 $strlen = $this->utf8_strlen; # strlen function for UTF-8.
1577
1578                 # Split in blocks.
1579                 $blocks = explode("\t", $line);
1580                 # Add each blocks to the line.
1581                 $line = $blocks[0];
1582                 unset($blocks[0]); # Do not add first block twice.
1583                 foreach ($blocks as $block) {
1584                         # Calculate amount of space, insert spaces, insert block.
1585                         $amount = $this->tab_width -
1586                                 $strlen($line, 'UTF-8') % $this->tab_width;
1587                         $line .= str_repeat(" ", $amount) . $block;
1588                 }
1589                 return $line;
1590         }
1591         protected function _initDetab() {
1592         #
1593         # Check for the availability of the function in the `utf8_strlen` property
1594         # (initially `mb_strlen`). If the function is not available, create a
1595         # function that will loosely count the number of UTF-8 characters with a
1596         # regular expression.
1597         #
1598                 if (function_exists($this->utf8_strlen)) return;
1599                 $this->utf8_strlen = create_function('$text', 'return preg_match_all(
1600                         "/[\\\\x00-\\\\xBF]|[\\\\xC0-\\\\xFF][\\\\x80-\\\\xBF]*/",
1601                         $text, $m);');
1602         }
1603
1604
1605         protected function unhash($text) {
1606         #
1607         # Swap back in all the tags hashed by _HashHTMLBlocks.
1608         #
1609                 return preg_replace_callback('/(.)\x1A[0-9]+\1/',
1610                         array($this, '_unhash_callback'), $text);
1611         }
1612         protected function _unhash_callback($matches) {
1613                 return $this->html_hashes[$matches[0]];
1614         }
1615
1616 }