]> git.mxchange.org Git - friendica.git/blob - library/HTMLPurifier/Lexer/DirectLex.php
Oops, re-kill the dead snake.
[friendica.git] / library / HTMLPurifier / Lexer / DirectLex.php
1 <?php
2
3 /**
4  * Our in-house implementation of a parser.
5  *
6  * A pure PHP parser, DirectLex has absolutely no dependencies, making
7  * it a reasonably good default for PHP4.  Written with efficiency in mind,
8  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
9  * pales in comparison to HTMLPurifier_Lexer_DOMLex.
10  *
11  * @todo Reread XML spec and document differences.
12  */
13 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14 {
15
16     public $tracksLineNumbers = true;
17
18     /**
19      * Whitespace characters for str(c)spn.
20      */
21     protected $_whitespace = "\x20\x09\x0D\x0A";
22
23     /**
24      * Callback function for script CDATA fudge
25      * @param $matches, in form of array(opening tag, contents, closing tag)
26      */
27     protected function scriptCallback($matches) {
28         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
29     }
30
31     public function tokenizeHTML($html, $config, $context) {
32
33         // special normalization for script tags without any armor
34         // our "armor" heurstic is a < sign any number of whitespaces after
35         // the first script tag
36         if ($config->get('HTML.Trusted')) {
37             $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
38                 array($this, 'scriptCallback'), $html);
39         }
40
41         $html = $this->normalize($html, $config, $context);
42
43         $cursor = 0; // our location in the text
44         $inside_tag = false; // whether or not we're parsing the inside of a tag
45         $array = array(); // result array
46
47         // This is also treated to mean maintain *column* numbers too
48         $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
49
50         if ($maintain_line_numbers === null) {
51             // automatically determine line numbering by checking
52             // if error collection is on
53             $maintain_line_numbers = $config->get('Core.CollectErrors');
54         }
55
56         if ($maintain_line_numbers) {
57             $current_line = 1;
58             $current_col  = 0;
59             $length = strlen($html);
60         } else {
61             $current_line = false;
62             $current_col  = false;
63             $length = false;
64         }
65         $context->register('CurrentLine', $current_line);
66         $context->register('CurrentCol',  $current_col);
67         $nl = "\n";
68         // how often to manually recalculate. This will ALWAYS be right,
69         // but it's pretty wasteful. Set to 0 to turn off
70         $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
71
72         $e = false;
73         if ($config->get('Core.CollectErrors')) {
74             $e =& $context->get('ErrorCollector');
75         }
76
77         // for testing synchronization
78         $loops = 0;
79
80         while(++$loops) {
81
82             // $cursor is either at the start of a token, or inside of
83             // a tag (i.e. there was a < immediately before it), as indicated
84             // by $inside_tag
85
86             if ($maintain_line_numbers) {
87
88                 // $rcursor, however, is always at the start of a token.
89                 $rcursor = $cursor - (int) $inside_tag;
90
91                 // Column number is cheap, so we calculate it every round.
92                 // We're interested at the *end* of the newline string, so
93                 // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
94                 // from our "rcursor" position.
95                 $nl_pos = strrpos($html, $nl, $rcursor - $length);
96                 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
97
98                 // recalculate lines
99                 if (
100                     $synchronize_interval &&  // synchronization is on
101                     $cursor > 0 &&            // cursor is further than zero
102                     $loops % $synchronize_interval === 0 // time to synchronize!
103                 ) {
104                     $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
105                 }
106
107             }
108
109             $position_next_lt = strpos($html, '<', $cursor);
110             $position_next_gt = strpos($html, '>', $cursor);
111
112             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
113             // special case to set up context
114             if ($position_next_lt === $cursor) {
115                 $inside_tag = true;
116                 $cursor++;
117             }
118
119             if (!$inside_tag && $position_next_lt !== false) {
120                 // We are not inside tag and there still is another tag to parse
121                 $token = new
122                     HTMLPurifier_Token_Text(
123                         $this->parseData(
124                             substr(
125                                 $html, $cursor, $position_next_lt - $cursor
126                             )
127                         )
128                     );
129                 if ($maintain_line_numbers) {
130                     $token->rawPosition($current_line, $current_col);
131                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
132                 }
133                 $array[] = $token;
134                 $cursor  = $position_next_lt + 1;
135                 $inside_tag = true;
136                 continue;
137             } elseif (!$inside_tag) {
138                 // We are not inside tag but there are no more tags
139                 // If we're already at the end, break
140                 if ($cursor === strlen($html)) break;
141                 // Create Text of rest of string
142                 $token = new
143                     HTMLPurifier_Token_Text(
144                         $this->parseData(
145                             substr(
146                                 $html, $cursor
147                             )
148                         )
149                     );
150                 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
151                 $array[] = $token;
152                 break;
153             } elseif ($inside_tag && $position_next_gt !== false) {
154                 // We are in tag and it is well formed
155                 // Grab the internals of the tag
156                 $strlen_segment = $position_next_gt - $cursor;
157
158                 if ($strlen_segment < 1) {
159                     // there's nothing to process!
160                     $token = new HTMLPurifier_Token_Text('<');
161                     $cursor++;
162                     continue;
163                 }
164
165                 $segment = substr($html, $cursor, $strlen_segment);
166
167                 if ($segment === false) {
168                     // somehow, we attempted to access beyond the end of
169                     // the string, defense-in-depth, reported by Nate Abele
170                     break;
171                 }
172
173                 // Check if it's a comment
174                 if (
175                     substr($segment, 0, 3) === '!--'
176                 ) {
177                     // re-determine segment length, looking for -->
178                     $position_comment_end = strpos($html, '-->', $cursor);
179                     if ($position_comment_end === false) {
180                         // uh oh, we have a comment that extends to
181                         // infinity. Can't be helped: set comment
182                         // end position to end of string
183                         if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
184                         $position_comment_end = strlen($html);
185                         $end = true;
186                     } else {
187                         $end = false;
188                     }
189                     $strlen_segment = $position_comment_end - $cursor;
190                     $segment = substr($html, $cursor, $strlen_segment);
191                     $token = new
192                         HTMLPurifier_Token_Comment(
193                             substr(
194                                 $segment, 3, $strlen_segment - 3
195                             )
196                         );
197                     if ($maintain_line_numbers) {
198                         $token->rawPosition($current_line, $current_col);
199                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
200                     }
201                     $array[] = $token;
202                     $cursor = $end ? $position_comment_end : $position_comment_end + 3;
203                     $inside_tag = false;
204                     continue;
205                 }
206
207                 // Check if it's an end tag
208                 $is_end_tag = (strpos($segment,'/') === 0);
209                 if ($is_end_tag) {
210                     $type = substr($segment, 1);
211                     $token = new HTMLPurifier_Token_End($type);
212                     if ($maintain_line_numbers) {
213                         $token->rawPosition($current_line, $current_col);
214                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
215                     }
216                     $array[] = $token;
217                     $inside_tag = false;
218                     $cursor = $position_next_gt + 1;
219                     continue;
220                 }
221
222                 // Check leading character is alnum, if not, we may
223                 // have accidently grabbed an emoticon. Translate into
224                 // text and go our merry way
225                 if (!ctype_alpha($segment[0])) {
226                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
227                     if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
228                     $token = new HTMLPurifier_Token_Text('<');
229                     if ($maintain_line_numbers) {
230                         $token->rawPosition($current_line, $current_col);
231                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
232                     }
233                     $array[] = $token;
234                     $inside_tag = false;
235                     continue;
236                 }
237
238                 // Check if it is explicitly self closing, if so, remove
239                 // trailing slash. Remember, we could have a tag like <br>, so
240                 // any later token processing scripts must convert improperly
241                 // classified EmptyTags from StartTags.
242                 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
243                 if ($is_self_closing) {
244                     $strlen_segment--;
245                     $segment = substr($segment, 0, $strlen_segment);
246                 }
247
248                 // Check if there are any attributes
249                 $position_first_space = strcspn($segment, $this->_whitespace);
250
251                 if ($position_first_space >= $strlen_segment) {
252                     if ($is_self_closing) {
253                         $token = new HTMLPurifier_Token_Empty($segment);
254                     } else {
255                         $token = new HTMLPurifier_Token_Start($segment);
256                     }
257                     if ($maintain_line_numbers) {
258                         $token->rawPosition($current_line, $current_col);
259                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
260                     }
261                     $array[] = $token;
262                     $inside_tag = false;
263                     $cursor = $position_next_gt + 1;
264                     continue;
265                 }
266
267                 // Grab out all the data
268                 $type = substr($segment, 0, $position_first_space);
269                 $attribute_string =
270                     trim(
271                         substr(
272                             $segment, $position_first_space
273                         )
274                     );
275                 if ($attribute_string) {
276                     $attr = $this->parseAttributeString(
277                                     $attribute_string
278                                   , $config, $context
279                               );
280                 } else {
281                     $attr = array();
282                 }
283
284                 if ($is_self_closing) {
285                     $token = new HTMLPurifier_Token_Empty($type, $attr);
286                 } else {
287                     $token = new HTMLPurifier_Token_Start($type, $attr);
288                 }
289                 if ($maintain_line_numbers) {
290                     $token->rawPosition($current_line, $current_col);
291                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
292                 }
293                 $array[] = $token;
294                 $cursor = $position_next_gt + 1;
295                 $inside_tag = false;
296                 continue;
297             } else {
298                 // inside tag, but there's no ending > sign
299                 if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
300                 $token = new
301                     HTMLPurifier_Token_Text(
302                         '<' .
303                         $this->parseData(
304                             substr($html, $cursor)
305                         )
306                     );
307                 if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
308                 // no cursor scroll? Hmm...
309                 $array[] = $token;
310                 break;
311             }
312             break;
313         }
314
315         $context->destroy('CurrentLine');
316         $context->destroy('CurrentCol');
317         return $array;
318     }
319
320     /**
321      * PHP 5.0.x compatible substr_count that implements offset and length
322      */
323     protected function substrCount($haystack, $needle, $offset, $length) {
324         static $oldVersion;
325         if ($oldVersion === null) {
326             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
327         }
328         if ($oldVersion) {
329             $haystack = substr($haystack, $offset, $length);
330             return substr_count($haystack, $needle);
331         } else {
332             return substr_count($haystack, $needle, $offset, $length);
333         }
334     }
335
336     /**
337      * Takes the inside of an HTML tag and makes an assoc array of attributes.
338      *
339      * @param $string Inside of tag excluding name.
340      * @returns Assoc array of attributes.
341      */
342     public function parseAttributeString($string, $config, $context) {
343         $string = (string) $string; // quick typecast
344
345         if ($string == '') return array(); // no attributes
346
347         $e = false;
348         if ($config->get('Core.CollectErrors')) {
349             $e =& $context->get('ErrorCollector');
350         }
351
352         // let's see if we can abort as quickly as possible
353         // one equal sign, no spaces => one attribute
354         $num_equal = substr_count($string, '=');
355         $has_space = strpos($string, ' ');
356         if ($num_equal === 0 && !$has_space) {
357             // bool attribute
358             return array($string => $string);
359         } elseif ($num_equal === 1 && !$has_space) {
360             // only one attribute
361             list($key, $quoted_value) = explode('=', $string);
362             $quoted_value = trim($quoted_value);
363             if (!$key) {
364                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
365                 return array();
366             }
367             if (!$quoted_value) return array($key => '');
368             $first_char = @$quoted_value[0];
369             $last_char  = @$quoted_value[strlen($quoted_value)-1];
370
371             $same_quote = ($first_char == $last_char);
372             $open_quote = ($first_char == '"' || $first_char == "'");
373
374             if ( $same_quote && $open_quote) {
375                 // well behaved
376                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
377             } else {
378                 // not well behaved
379                 if ($open_quote) {
380                     if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
381                     $value = substr($quoted_value, 1);
382                 } else {
383                     $value = $quoted_value;
384                 }
385             }
386             if ($value === false) $value = '';
387             return array($key => $this->parseData($value));
388         }
389
390         // setup loop environment
391         $array  = array(); // return assoc array of attributes
392         $cursor = 0; // current position in string (moves forward)
393         $size   = strlen($string); // size of the string (stays the same)
394
395         // if we have unquoted attributes, the parser expects a terminating
396         // space, so let's guarantee that there's always a terminating space.
397         $string .= ' ';
398
399         while(true) {
400
401             if ($cursor >= $size) {
402                 break;
403             }
404
405             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
406             // grab the key
407
408             $key_begin = $cursor; //we're currently at the start of the key
409
410             // scroll past all characters that are the key (not whitespace or =)
411             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
412
413             $key_end = $cursor; // now at the end of the key
414
415             $key = substr($string, $key_begin, $key_end - $key_begin);
416
417             if (!$key) {
418                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
419                 $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
420                 continue; // empty key
421             }
422
423             // scroll past all whitespace
424             $cursor += strspn($string, $this->_whitespace, $cursor);
425
426             if ($cursor >= $size) {
427                 $array[$key] = $key;
428                 break;
429             }
430
431             // if the next character is an equal sign, we've got a regular
432             // pair, otherwise, it's a bool attribute
433             $first_char = @$string[$cursor];
434
435             if ($first_char == '=') {
436                 // key="value"
437
438                 $cursor++;
439                 $cursor += strspn($string, $this->_whitespace, $cursor);
440
441                 if ($cursor === false) {
442                     $array[$key] = '';
443                     break;
444                 }
445
446                 // we might be in front of a quote right now
447
448                 $char = @$string[$cursor];
449
450                 if ($char == '"' || $char == "'") {
451                     // it's quoted, end bound is $char
452                     $cursor++;
453                     $value_begin = $cursor;
454                     $cursor = strpos($string, $char, $cursor);
455                     $value_end = $cursor;
456                 } else {
457                     // it's not quoted, end bound is whitespace
458                     $value_begin = $cursor;
459                     $cursor += strcspn($string, $this->_whitespace, $cursor);
460                     $value_end = $cursor;
461                 }
462
463                 // we reached a premature end
464                 if ($cursor === false) {
465                     $cursor = $size;
466                     $value_end = $cursor;
467                 }
468
469                 $value = substr($string, $value_begin, $value_end - $value_begin);
470                 if ($value === false) $value = '';
471                 $array[$key] = $this->parseData($value);
472                 $cursor++;
473
474             } else {
475                 // boolattr
476                 if ($key !== '') {
477                     $array[$key] = $key;
478                 } else {
479                     // purely theoretical
480                     if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
481                 }
482
483             }
484         }
485         return $array;
486     }
487
488 }
489
490 // vim: et sw=4 sts=4