library/HTMLPurifier/Lexer/DirectLex.php

   1 <?php
   2
   3 /**
   4  * Our in-house implementation of a parser.
   5  *
   6  * A pure PHP parser, DirectLex has absolutely no dependencies, making
   7  * it a reasonably good default for PHP4.  Written with efficiency in mind,
   8  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   9  * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  10  *
  11  * @todo Reread XML spec and document differences.
  12  */
  13 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  14 {
  15
  16     public $tracksLineNumbers = true;
  17
  18     /**
  19      * Whitespace characters for str(c)spn.
  20      */
  21     protected $_whitespace = "\x20\x09\x0D\x0A";
  22
  23     /**
  24      * Callback function for script CDATA fudge
  25      * @param $matches, in form of array(opening tag, contents, closing tag)
  26      */
  27     protected function scriptCallback($matches) {
  28         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  29     }
  30
  31     public function tokenizeHTML($html, $config, $context) {
  32
  33         // special normalization for script tags without any armor
  34         // our "armor" heurstic is a < sign any number of whitespaces after
  35         // the first script tag
  36         if ($config->get('HTML.Trusted')) {
  37             $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  38                 array($this, 'scriptCallback'), $html);
  39         }
  40
  41         $html = $this->normalize($html, $config, $context);
  42
  43         $cursor = 0; // our location in the text
  44         $inside_tag = false; // whether or not we're parsing the inside of a tag
  45         $array = array(); // result array
  46
  47         // This is also treated to mean maintain *column* numbers too
  48         $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  49
  50         if ($maintain_line_numbers === null) {
  51             // automatically determine line numbering by checking
  52             // if error collection is on
  53             $maintain_line_numbers = $config->get('Core.CollectErrors');
  54         }
  55
  56         if ($maintain_line_numbers) {
  57             $current_line = 1;
  58             $current_col  = 0;
  59             $length = strlen($html);
  60         } else {
  61             $current_line = false;
  62             $current_col  = false;
  63             $length = false;
  64         }
  65         $context->register('CurrentLine', $current_line);
  66         $context->register('CurrentCol',  $current_col);
  67         $nl = "\n";
  68         // how often to manually recalculate. This will ALWAYS be right,
  69         // but it's pretty wasteful. Set to 0 to turn off
  70         $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  71
  72         $e = false;
  73         if ($config->get('Core.CollectErrors')) {
  74             $e =& $context->get('ErrorCollector');
  75         }
  76
  77         // for testing synchronization
  78         $loops = 0;
  79
  80         while(++$loops) {
  81
  82             // $cursor is either at the start of a token, or inside of
  83             // a tag (i.e. there was a < immediately before it), as indicated
  84             // by $inside_tag
  85
  86             if ($maintain_line_numbers) {
  87
  88                 // $rcursor, however, is always at the start of a token.
  89                 $rcursor = $cursor - (int) $inside_tag;
  90
  91                 // Column number is cheap, so we calculate it every round.
  92                 // We're interested at the *end* of the newline string, so
  93                 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
  94                 // from our "rcursor" position.
  95                 $nl_pos = strrpos($html, $nl, $rcursor - $length);
  96                 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
  97
  98                 // recalculate lines
  99                 if (
 100                     $synchronize_interval &&  // synchronization is on
 101                     $cursor > 0 &&            // cursor is further than zero
 102                     $loops % $synchronize_interval === 0 // time to synchronize!
 103                 ) {
 104                     $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 105                 }
 106
 107             }
 108
 109             $position_next_lt = strpos($html, '<', $cursor);
 110             $position_next_gt = strpos($html, '>', $cursor);
 111
 112             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 113             // special case to set up context
 114             if ($position_next_lt === $cursor) {
 115                 $inside_tag = true;
 116                 $cursor++;
 117             }
 118
 119             if (!$inside_tag && $position_next_lt !== false) {
 120                 // We are not inside tag and there still is another tag to parse
 121                 $token = new
 122                     HTMLPurifier_Token_Text(
 123                         $this->parseData(
 124                             substr(
 125                                 $html, $cursor, $position_next_lt - $cursor
 126                             )
 127                         )
 128                     );
 129                 if ($maintain_line_numbers) {
 130                     $token->rawPosition($current_line, $current_col);
 131                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 132                 }
 133                 $array[] = $token;
 134                 $cursor  = $position_next_lt + 1;
 135                 $inside_tag = true;
 136                 continue;
 137             } elseif (!$inside_tag) {
 138                 // We are not inside tag but there are no more tags
 139                 // If we're already at the end, break
 140                 if ($cursor === strlen($html)) break;
 141                 // Create Text of rest of string
 142                 $token = new
 143                     HTMLPurifier_Token_Text(
 144                         $this->parseData(
 145                             substr(
 146                                 $html, $cursor
 147                             )
 148                         )
 149                     );
 150                 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
 151                 $array[] = $token;
 152                 break;
 153             } elseif ($inside_tag && $position_next_gt !== false) {
 154                 // We are in tag and it is well formed
 155                 // Grab the internals of the tag
 156                 $strlen_segment = $position_next_gt - $cursor;
 157
 158                 if ($strlen_segment < 1) {
 159                     // there's nothing to process!
 160                     $token = new HTMLPurifier_Token_Text('<');
 161                     $cursor++;
 162                     continue;
 163                 }
 164
 165                 $segment = substr($html, $cursor, $strlen_segment);
 166
 167                 if ($segment === false) {
 168                     // somehow, we attempted to access beyond the end of
 169                     // the string, defense-in-depth, reported by Nate Abele
 170                     break;
 171                 }
 172
 173                 // Check if it's a comment
 174                 if (
 175                     substr($segment, 0, 3) === '!--'
 176                 ) {
 177                     // re-determine segment length, looking for -->
 178                     $position_comment_end = strpos($html, '-->', $cursor);
 179                     if ($position_comment_end === false) {
 180                         // uh oh, we have a comment that extends to
 181                         // infinity. Can't be helped: set comment
 182                         // end position to end of string
 183                         if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
 184                         $position_comment_end = strlen($html);
 185                         $end = true;
 186                     } else {
 187                         $end = false;
 188                     }
 189                     $strlen_segment = $position_comment_end - $cursor;
 190                     $segment = substr($html, $cursor, $strlen_segment);
 191                     $token = new
 192                         HTMLPurifier_Token_Comment(
 193                             substr(
 194                                 $segment, 3, $strlen_segment - 3
 195                             )
 196                         );
 197                     if ($maintain_line_numbers) {
 198                         $token->rawPosition($current_line, $current_col);
 199                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 200                     }
 201                     $array[] = $token;
 202                     $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 203                     $inside_tag = false;
 204                     continue;
 205                 }
 206
 207                 // Check if it's an end tag
 208                 $is_end_tag = (strpos($segment,'/') === 0);
 209                 if ($is_end_tag) {
 210                     $type = substr($segment, 1);
 211                     $token = new HTMLPurifier_Token_End($type);
 212                     if ($maintain_line_numbers) {
 213                         $token->rawPosition($current_line, $current_col);
 214                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 215                     }
 216                     $array[] = $token;
 217                     $inside_tag = false;
 218                     $cursor = $position_next_gt + 1;
 219                     continue;
 220                 }
 221
 222                 // Check leading character is alnum, if not, we may
 223                 // have accidently grabbed an emoticon. Translate into
 224                 // text and go our merry way
 225                 if (!ctype_alpha($segment[0])) {
 226                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 227                     if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 228                     $token = new HTMLPurifier_Token_Text('<');
 229                     if ($maintain_line_numbers) {
 230                         $token->rawPosition($current_line, $current_col);
 231                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 232                     }
 233                     $array[] = $token;
 234                     $inside_tag = false;
 235                     continue;
 236                 }
 237
 238                 // Check if it is explicitly self closing, if so, remove
 239                 // trailing slash. Remember, we could have a tag like <br>, so
 240                 // any later token processing scripts must convert improperly
 241                 // classified EmptyTags from StartTags.
 242                 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
 243                 if ($is_self_closing) {
 244                     $strlen_segment--;
 245                     $segment = substr($segment, 0, $strlen_segment);
 246                 }
 247
 248                 // Check if there are any attributes
 249                 $position_first_space = strcspn($segment, $this->_whitespace);
 250
 251                 if ($position_first_space >= $strlen_segment) {
 252                     if ($is_self_closing) {
 253                         $token = new HTMLPurifier_Token_Empty($segment);
 254                     } else {
 255                         $token = new HTMLPurifier_Token_Start($segment);
 256                     }
 257                     if ($maintain_line_numbers) {
 258                         $token->rawPosition($current_line, $current_col);
 259                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 260                     }
 261                     $array[] = $token;
 262                     $inside_tag = false;
 263                     $cursor = $position_next_gt + 1;
 264                     continue;
 265                 }
 266
 267                 // Grab out all the data
 268                 $type = substr($segment, 0, $position_first_space);
 269                 $attribute_string =
 270                     trim(
 271                         substr(
 272                             $segment, $position_first_space
 273                         )
 274                     );
 275                 if ($attribute_string) {
 276                     $attr = $this->parseAttributeString(
 277                                     $attribute_string
 278                                   , $config, $context
 279                               );
 280                 } else {
 281                     $attr = array();
 282                 }
 283
 284                 if ($is_self_closing) {
 285                     $token = new HTMLPurifier_Token_Empty($type, $attr);
 286                 } else {
 287                     $token = new HTMLPurifier_Token_Start($type, $attr);
 288                 }
 289                 if ($maintain_line_numbers) {
 290                     $token->rawPosition($current_line, $current_col);
 291                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 292                 }
 293                 $array[] = $token;
 294                 $cursor = $position_next_gt + 1;
 295                 $inside_tag = false;
 296                 continue;
 297             } else {
 298                 // inside tag, but there's no ending > sign
 299                 if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
 300                 $token = new
 301                     HTMLPurifier_Token_Text(
 302                         '<' .
 303                         $this->parseData(
 304                             substr($html, $cursor)
 305                         )
 306                     );
 307                 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
 308                 // no cursor scroll? Hmm...
 309                 $array[] = $token;
 310                 break;
 311             }
 312             break;
 313         }
 314
 315         $context->destroy('CurrentLine');
 316         $context->destroy('CurrentCol');
 317         return $array;
 318     }
 319
 320     /**
 321      * PHP 5.0.x compatible substr_count that implements offset and length
 322      */
 323     protected function substrCount($haystack, $needle, $offset, $length) {
 324         static $oldVersion;
 325         if ($oldVersion === null) {
 326             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 327         }
 328         if ($oldVersion) {
 329             $haystack = substr($haystack, $offset, $length);
 330             return substr_count($haystack, $needle);
 331         } else {
 332             return substr_count($haystack, $needle, $offset, $length);
 333         }
 334     }
 335
 336     /**
 337      * Takes the inside of an HTML tag and makes an assoc array of attributes.
 338      *
 339      * @param $string Inside of tag excluding name.
 340      * @returns Assoc array of attributes.
 341      */
 342     public function parseAttributeString($string, $config, $context) {
 343         $string = (string) $string; // quick typecast
 344
 345         if ($string == '') return array(); // no attributes
 346
 347         $e = false;
 348         if ($config->get('Core.CollectErrors')) {
 349             $e =& $context->get('ErrorCollector');
 350         }
 351
 352         // let's see if we can abort as quickly as possible
 353         // one equal sign, no spaces => one attribute
 354         $num_equal = substr_count($string, '=');
 355         $has_space = strpos($string, ' ');
 356         if ($num_equal === 0 && !$has_space) {
 357             // bool attribute
 358             return array($string => $string);
 359         } elseif ($num_equal === 1 && !$has_space) {
 360             // only one attribute
 361             list($key, $quoted_value) = explode('=', $string);
 362             $quoted_value = trim($quoted_value);
 363             if (!$key) {
 364                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 365                 return array();
 366             }
 367             if (!$quoted_value) return array($key => '');
 368             $first_char = @$quoted_value[0];
 369             $last_char  = @$quoted_value[strlen($quoted_value)-1];
 370
 371             $same_quote = ($first_char == $last_char);
 372             $open_quote = ($first_char == '"' || $first_char == "'");
 373
 374             if ( $same_quote && $open_quote) {
 375                 // well behaved
 376                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 377             } else {
 378                 // not well behaved
 379                 if ($open_quote) {
 380                     if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
 381                     $value = substr($quoted_value, 1);
 382                 } else {
 383                     $value = $quoted_value;
 384                 }
 385             }
 386             if ($value === false) $value = '';
 387             return array($key => $this->parseData($value));
 388         }
 389
 390         // setup loop environment
 391         $array  = array(); // return assoc array of attributes
 392         $cursor = 0; // current position in string (moves forward)
 393         $size   = strlen($string); // size of the string (stays the same)
 394
 395         // if we have unquoted attributes, the parser expects a terminating
 396         // space, so let's guarantee that there's always a terminating space.
 397         $string .= ' ';
 398
 399         while(true) {
 400
 401             if ($cursor >= $size) {
 402                 break;
 403             }
 404
 405             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 406             // grab the key
 407
 408             $key_begin = $cursor; //we're currently at the start of the key
 409
 410             // scroll past all characters that are the key (not whitespace or =)
 411             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 412
 413             $key_end = $cursor; // now at the end of the key
 414
 415             $key = substr($string, $key_begin, $key_end - $key_begin);
 416
 417             if (!$key) {
 418                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 419                 $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 420                 continue; // empty key
 421             }
 422
 423             // scroll past all whitespace
 424             $cursor += strspn($string, $this->_whitespace, $cursor);
 425
 426             if ($cursor >= $size) {
 427                 $array[$key] = $key;
 428                 break;
 429             }
 430
 431             // if the next character is an equal sign, we've got a regular
 432             // pair, otherwise, it's a bool attribute
 433             $first_char = @$string[$cursor];
 434
 435             if ($first_char == '=') {
 436                 // key="value"
 437
 438                 $cursor++;
 439                 $cursor += strspn($string, $this->_whitespace, $cursor);
 440
 441                 if ($cursor === false) {
 442                     $array[$key] = '';
 443                     break;
 444                 }
 445
 446                 // we might be in front of a quote right now
 447
 448                 $char = @$string[$cursor];
 449
 450                 if ($char == '"' || $char == "'") {
 451                     // it's quoted, end bound is $char
 452                     $cursor++;
 453                     $value_begin = $cursor;
 454                     $cursor = strpos($string, $char, $cursor);
 455                     $value_end = $cursor;
 456                 } else {
 457                     // it's not quoted, end bound is whitespace
 458                     $value_begin = $cursor;
 459                     $cursor += strcspn($string, $this->_whitespace, $cursor);
 460                     $value_end = $cursor;
 461                 }
 462
 463                 // we reached a premature end
 464                 if ($cursor === false) {
 465                     $cursor = $size;
 466                     $value_end = $cursor;
 467                 }
 468
 469                 $value = substr($string, $value_begin, $value_end - $value_begin);
 470                 if ($value === false) $value = '';
 471                 $array[$key] = $this->parseData($value);
 472                 $cursor++;
 473
 474             } else {
 475                 // boolattr
 476                 if ($key !== '') {
 477                     $array[$key] = $key;
 478                 } else {
 479                     // purely theoretical
 480                     if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 481                 }
 482
 483             }
 484         }
 485         return $array;
 486     }
 487
 488 }
 489
 490 // vim: et sw=4 sts=4