git.mxchange.org Git - quix0rs-gnu-social.git/commitdiff
Updating HTMLPurifier to 4.9.3
author: Mikael Nordfeldth <mmn@hethane.se>
Mon, 10 Jul 2017 11:46:07 +0000 (13:46 +0200)
committer: Mikael Nordfeldth <mmn@hethane.se>
Mon, 10 Jul 2017 11:46:07 +0000 (13:46 +0200)
Source: https://htmlpurifier.org/download
Release date: 2017-06-19

54 files changed:
extlib/HTMLPurifier/HTMLPurifier.includes.php
extlib/HTMLPurifier/HTMLPurifier.php
extlib/HTMLPurifier/HTMLPurifier.safe-includes.php
extlib/HTMLPurifier/HTMLPurifier/Arborize.php
extlib/HTMLPurifier/HTMLPurifier/AttrCollections.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/CSS.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/CSS/Color.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/CSS/FontFamily.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/CSS/URI.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/HTML/ID.php
extlib/HTMLPurifier/HTMLPurifier/AttrDef/URI/Host.php
extlib/HTMLPurifier/HTMLPurifier/AttrTransform/ImgRequired.php
extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoopener.php [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoreferrer.php [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/CSSDefinition.php
extlib/HTMLPurifier/HTMLPurifier/ChildDef/List.php
extlib/HTMLPurifier/HTMLPurifier/ChildDef/Table.php
extlib/HTMLPurifier/HTMLPurifier/Config.php
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema.php
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema.ser
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Attr.ID.HTML5.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/CSS.AllowDuplicates.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Cache.SerializerPermissions.txt
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyRemoveScript.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoopener.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoreferrer.txt [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/URI.AllowedSchemes.txt
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/URI.DefaultScheme.txt
extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/URI.Munge.txt
extlib/HTMLPurifier/HTMLPurifier/DefinitionCache.php
extlib/HTMLPurifier/HTMLPurifier/DefinitionCache/Serializer.php
extlib/HTMLPurifier/HTMLPurifier/Encoder.php
extlib/HTMLPurifier/HTMLPurifier/EntityParser.php
extlib/HTMLPurifier/HTMLPurifier/Filter/ExtractStyleBlocks.php
extlib/HTMLPurifier/HTMLPurifier/Generator.php
extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoopener.php [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoreferrer.php [new file with mode: 0644]
extlib/HTMLPurifier/HTMLPurifier/HTMLModuleManager.php
extlib/HTMLPurifier/HTMLPurifier/Injector/Linkify.php
extlib/HTMLPurifier/HTMLPurifier/Injector/RemoveEmpty.php
extlib/HTMLPurifier/HTMLPurifier/Injector/SafeObject.php
extlib/HTMLPurifier/HTMLPurifier/Lexer.php
extlib/HTMLPurifier/HTMLPurifier/Lexer/DOMLex.php
extlib/HTMLPurifier/HTMLPurifier/Lexer/DirectLex.php
extlib/HTMLPurifier/HTMLPurifier/Lexer/PH5P.php
extlib/HTMLPurifier/HTMLPurifier/Printer/ConfigForm.php
extlib/HTMLPurifier/HTMLPurifier/Strategy/MakeWellFormed.php
extlib/HTMLPurifier/HTMLPurifier/Token.php
extlib/HTMLPurifier/HTMLPurifier/URI.php
extlib/HTMLPurifier/HTMLPurifier/URIScheme/data.php
extlib/HTMLPurifier/HTMLPurifier/URIScheme/tel.php [new file with mode: 0644]
extlib/HTMLPurifier/VERSION

index fdb58c2d376cd9b09fc9dc1d564e9c34fb30060a..e8bce5c850ca0f29b2cab996b3b449f18f58336d 100644 (file)
@@ -7,7 +7,7 @@
  * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
  * FILE, changes will be overwritten the next time the script is run.
  *
- * @version 4.7.0
+ * @version 4.9.3
  *
  * @warning
  *      You must *not* include any other HTML Purifier files before this file,
@@ -137,6 +137,8 @@ require 'HTMLPurifier/AttrTransform/SafeObject.php';
 require 'HTMLPurifier/AttrTransform/SafeParam.php';
 require 'HTMLPurifier/AttrTransform/ScriptRequired.php';
 require 'HTMLPurifier/AttrTransform/TargetBlank.php';
+require 'HTMLPurifier/AttrTransform/TargetNoopener.php';
+require 'HTMLPurifier/AttrTransform/TargetNoreferrer.php';
 require 'HTMLPurifier/AttrTransform/Textarea.php';
 require 'HTMLPurifier/ChildDef/Chameleon.php';
 require 'HTMLPurifier/ChildDef/Custom.php';
@@ -175,6 +177,8 @@ require 'HTMLPurifier/HTMLModule/StyleAttribute.php';
 require 'HTMLPurifier/HTMLModule/Tables.php';
 require 'HTMLPurifier/HTMLModule/Target.php';
 require 'HTMLPurifier/HTMLModule/TargetBlank.php';
+require 'HTMLPurifier/HTMLModule/TargetNoopener.php';
+require 'HTMLPurifier/HTMLModule/TargetNoreferrer.php';
 require 'HTMLPurifier/HTMLModule/Text.php';
 require 'HTMLPurifier/HTMLModule/Tidy.php';
 require 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
@@ -225,5 +229,6 @@ require 'HTMLPurifier/URIScheme/https.php';
 require 'HTMLPurifier/URIScheme/mailto.php';
 require 'HTMLPurifier/URIScheme/news.php';
 require 'HTMLPurifier/URIScheme/nntp.php';
+require 'HTMLPurifier/URIScheme/tel.php';
 require 'HTMLPurifier/VarParser/Flexible.php';
 require 'HTMLPurifier/VarParser/Native.php';
index c6041bc1130c3db90099e74302fd5688537e2d5e..b4605ebc6e9e400823e20c33a35b82dfdf9653c0 100644 (file)
@@ -19,7 +19,7 @@
  */
 
 /*
-    HTML Purifier 4.7.0 - Standards Compliant HTML Filtering
+    HTML Purifier 4.9.3 - Standards Compliant HTML Filtering
     Copyright (C) 2006-2008 Edward Z. Yang
 
     This library is free software; you can redistribute it and/or
@@ -58,12 +58,12 @@ class HTMLPurifier
      * Version of HTML Purifier.
      * @type string
      */
-    public $version = '4.7.0';
+    public $version = '4.9.3';
 
     /**
      * Constant with version of HTML Purifier.
      */
-    const VERSION = '4.7.0';
+    const VERSION = '4.9.3';
 
     /**
      * Global configuration object.
@@ -104,7 +104,7 @@ class HTMLPurifier
     /**
      * Initializes the purifier.
      *
-     * @param HTMLPurifier_Config $config Optional HTMLPurifier_Config object
+     * @param HTMLPurifier_Config|mixed $config Optional HTMLPurifier_Config object
      *                for all instances of the purifier, if omitted, a default
      *                configuration is supplied (which can be overridden on a
      *                per-use basis).
index 9dea6d1ed55d9f8c69b5a968c62c5f701e5887e4..a3261f8a327a175ec04a5d2c3e931831575c984b 100644 (file)
@@ -131,6 +131,8 @@ require_once $__dir . '/HTMLPurifier/AttrTransform/SafeObject.php';
 require_once $__dir . '/HTMLPurifier/AttrTransform/SafeParam.php';
 require_once $__dir . '/HTMLPurifier/AttrTransform/ScriptRequired.php';
 require_once $__dir . '/HTMLPurifier/AttrTransform/TargetBlank.php';
+require_once $__dir . '/HTMLPurifier/AttrTransform/TargetNoopener.php';
+require_once $__dir . '/HTMLPurifier/AttrTransform/TargetNoreferrer.php';
 require_once $__dir . '/HTMLPurifier/AttrTransform/Textarea.php';
 require_once $__dir . '/HTMLPurifier/ChildDef/Chameleon.php';
 require_once $__dir . '/HTMLPurifier/ChildDef/Custom.php';
@@ -169,6 +171,8 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/StyleAttribute.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/Tables.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/Target.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/TargetBlank.php';
+require_once $__dir . '/HTMLPurifier/HTMLModule/TargetNoopener.php';
+require_once $__dir . '/HTMLPurifier/HTMLModule/TargetNoreferrer.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/Text.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy.php';
 require_once $__dir . '/HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
@@ -219,5 +223,6 @@ require_once $__dir . '/HTMLPurifier/URIScheme/https.php';
 require_once $__dir . '/HTMLPurifier/URIScheme/mailto.php';
 require_once $__dir . '/HTMLPurifier/URIScheme/news.php';
 require_once $__dir . '/HTMLPurifier/URIScheme/nntp.php';
+require_once $__dir . '/HTMLPurifier/URIScheme/tel.php';
 require_once $__dir . '/HTMLPurifier/VarParser/Flexible.php';
 require_once $__dir . '/HTMLPurifier/VarParser/Native.php';
index 9e6617be5de4bc6416d44ff53137dd2320f10ce0..d2e9d22a2076890233a3827d7acefd4759ca118f 100644 (file)
@@ -19,8 +19,8 @@ class HTMLPurifier_Arborize
             if ($token instanceof HTMLPurifier_Token_End) {
                 $token->start = null; // [MUT]
                 $r = array_pop($stack);
-                assert($r->name === $token->name);
-                assert(empty($token->attr));
+                //assert($r->name === $token->name);
+                //assert(empty($token->attr));
                 $r->endCol = $token->col;
                 $r->endLine = $token->line;
                 $r->endArmor = $token->armor;
@@ -32,7 +32,7 @@ class HTMLPurifier_Arborize
                 $stack[] = $node;
             }
         }
-        assert(count($stack) == 1);
+        //assert(count($stack) == 1);
         return $stack[0];
     }
 
index 4f6c2e39a2ed49356859391e114becae2ce54b38..c7b17cf144dbcfd0d96a7d0054e79e907aa895e7 100644 (file)
@@ -21,6 +21,11 @@ class HTMLPurifier_AttrCollections
      * @param HTMLPurifier_HTMLModule[] $modules Hash array of HTMLPurifier_HTMLModule members
      */
     public function __construct($attr_types, $modules)
+    {
+        $this->doConstruct($attr_types, $modules);
+    }
+
+    public function doConstruct($attr_types, $modules)
     {
         // load extensions from the modules
         foreach ($modules as $module) {
index 5ac06522b9eb7dca361042f9c4801371d2d5250c..739646fa7c26c4f4572d33b5af5d85a4be6b2db6 100644 (file)
@@ -86,7 +86,13 @@ abstract class HTMLPurifier_AttrDef
      */
     protected function mungeRgb($string)
     {
-        return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
+        $p = '\s*(\d+(\.\d+)?([%]?))\s*';
+
+        if (preg_match('/(rgba|hsla)\(/', $string)) {
+            return preg_replace('/(rgba|hsla)\('.$p.','.$p.','.$p.','.$p.'\)/', '\1(\2,\5,\8,\11)', $string);
+        }
+
+        return preg_replace('/(rgb|hsl)\('.$p.','.$p.','.$p.'\)/', '\1(\2,\5,\8)', $string);
     }
 
     /**
index 02c1641fb2a14abb2165996d35eec77b1b9347e1..ad2cb90ad1f32602e5f4d2eacd01fc7cadfefbe4 100644 (file)
@@ -25,15 +25,42 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
         $css = $this->parseCDATA($css);
 
         $definition = $config->getCSSDefinition();
+        $allow_duplicates = $config->get("CSS.AllowDuplicates");
 
-        // we're going to break the spec and explode by semicolons.
-        // This is because semicolon rarely appears in escaped form
-        // Doing this is generally flaky but fast
-        // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
-        // for details
 
-        $declarations = explode(';', $css);
+        // According to the CSS2.1 spec, the places where a
+        // non-delimiting semicolon can appear are in strings
+        // escape sequences.   So here is some dumb hack to
+        // handle quotes.
+        $len = strlen($css);
+        $accum = "";
+        $declarations = array();
+        $quoted = false;
+        for ($i = 0; $i < $len; $i++) {
+            $c = strcspn($css, ";'\"", $i);
+            $accum .= substr($css, $i, $c);
+            $i += $c;
+            if ($i == $len) break;
+            $d = $css[$i];
+            if ($quoted) {
+                $accum .= $d;
+                if ($d == $quoted) {
+                    $quoted = false;
+                }
+            } else {
+                if ($d == ";") {
+                    $declarations[] = $accum;
+                    $accum = "";
+                } else {
+                    $accum .= $d;
+                    $quoted = $d;
+                }
+            }
+        }
+        if ($accum != "") $declarations[] = $accum;
+
         $propvalues = array();
+        $new_declarations = '';
 
         /**
          * Name of the current CSS property being validated.
@@ -83,7 +110,11 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
             if ($result === false) {
                 continue;
             }
-            $propvalues[$property] = $result;
+            if ($allow_duplicates) {
+                $new_declarations .= "$property:$result;";
+            } else {
+                $propvalues[$property] = $result;
+            }
         }
 
         $context->destroy('CurrentCSSProperty');
@@ -92,7 +123,6 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
         // slightly inefficient, but it's the only way of getting rid of
         // duplicates. Perhaps config to optimize it, but not now.
 
-        $new_declarations = '';
         foreach ($propvalues as $prop => $value) {
             $new_declarations .= "$prop:$value;";
         }
index 16d2a6b98c1e8ba09907d8025eae5c8364f85c5d..d7287a00c2e85ef3aef9f52af970788c748f1c6e 100644 (file)
@@ -6,6 +6,16 @@
 class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
 {
 
+    /**
+     * @type HTMLPurifier_AttrDef_CSS_AlphaValue
+     */
+    protected $alpha;
+
+    public function __construct()
+    {
+        $this->alpha = new HTMLPurifier_AttrDef_CSS_AlphaValue();
+    }
+
     /**
      * @param string $color
      * @param HTMLPurifier_Config $config
@@ -29,59 +39,104 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
             return $colors[$lower];
         }
 
-        if (strpos($color, 'rgb(') !== false) {
-            // rgb literal handling
+        if (preg_match('#(rgb|rgba|hsl|hsla)\(#', $color, $matches) === 1) {
             $length = strlen($color);
             if (strpos($color, ')') !== $length - 1) {
                 return false;
             }
-            $triad = substr($color, 4, $length - 4 - 1);
-            $parts = explode(',', $triad);
-            if (count($parts) !== 3) {
+
+            // get used function : rgb, rgba, hsl or hsla
+            $function = $matches[1];
+
+            $parameters_size = 3;
+            $alpha_channel = false;
+            if (substr($function, -1) === 'a') {
+                $parameters_size = 4;
+                $alpha_channel = true;
+            }
+
+            /*
+             * Allowed types for values :
+             * parameter_position => [type => max_value]
+             */
+            $allowed_types = array(
+                1 => array('percentage' => 100, 'integer' => 255),
+                2 => array('percentage' => 100, 'integer' => 255),
+                3 => array('percentage' => 100, 'integer' => 255),
+            );
+            $allow_different_types = false;
+
+            if (strpos($function, 'hsl') !== false) {
+                $allowed_types = array(
+                    1 => array('integer' => 360),
+                    2 => array('percentage' => 100),
+                    3 => array('percentage' => 100),
+                );
+                $allow_different_types = true;
+            }
+
+            $values = trim(str_replace($function, '', $color), ' ()');
+
+            $parts = explode(',', $values);
+            if (count($parts) !== $parameters_size) {
                 return false;
             }
-            $type = false; // to ensure that they're all the same type
+
+            $type = false;
             $new_parts = array();
+            $i = 0;
+
             foreach ($parts as $part) {
+                $i++;
                 $part = trim($part);
+
                 if ($part === '') {
                     return false;
                 }
-                $length = strlen($part);
-                if ($part[$length - 1] === '%') {
-                    // handle percents
-                    if (!$type) {
-                        $type = 'percentage';
-                    } elseif ($type !== 'percentage') {
+
+                // different check for alpha channel
+                if ($alpha_channel === true && $i === count($parts)) {
+                    $result = $this->alpha->validate($part, $config, $context);
+
+                    if ($result === false) {
                         return false;
                     }
-                    $num = (float)substr($part, 0, $length - 1);
-                    if ($num < 0) {
-                        $num = 0;
-                    }
-                    if ($num > 100) {
-                        $num = 100;
-                    }
-                    $new_parts[] = "$num%";
+
+                    $new_parts[] = (string)$result;
+                    continue;
+                }
+
+                if (substr($part, -1) === '%') {
+                    $current_type = 'percentage';
                 } else {
-                    // handle integers
-                    if (!$type) {
-                        $type = 'integer';
-                    } elseif ($type !== 'integer') {
-                        return false;
-                    }
-                    $num = (int)$part;
-                    if ($num < 0) {
-                        $num = 0;
-                    }
-                    if ($num > 255) {
-                        $num = 255;
-                    }
-                    $new_parts[] = (string)$num;
+                    $current_type = 'integer';
+                }
+
+                if (!array_key_exists($current_type, $allowed_types[$i])) {
+                    return false;
+                }
+
+                if (!$type) {
+                    $type = $current_type;
+                }
+
+                if ($allow_different_types === false && $type != $current_type) {
+                    return false;
+                }
+
+                $max_value = $allowed_types[$i][$current_type];
+
+                if ($current_type == 'integer') {
+                    // Return value between range 0 -> $max_value
+                    $new_parts[] = (int)max(min($part, $max_value), 0);
+                } elseif ($current_type == 'percentage') {
+                    $new_parts[] = (float)max(min(rtrim($part, '%'), $max_value), 0) . '%';
                 }
             }
-            $new_triad = implode(',', $new_parts);
-            $color = "rgb($new_triad)";
+
+            $new_values = implode(',', $new_parts);
+
+            $color = $function . '(' . $new_values . ')';
         } else {
             // hexadecimal handling
             if ($color[0] === '#') {
@@ -100,6 +155,7 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
         }
         return $color;
     }
+
 }
 
 // vim: et sw=4 sts=4
index 86101020dc9fa133ffec3c456096bfb6ccc86e94..74e24c8816ec91404b5cf9a29b1330057f18768c 100644 (file)
@@ -130,6 +130,8 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
             //    <http://ja.wikipedia.org/wiki/MS_明朝>.  See
             //    the CSS3 spec for more examples:
             //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
+            //    You can see live samples of these on the Internet:
+            //    <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
             //    However, most of these fonts have ASCII equivalents:
             //    for example, 'MS Mincho', and it's considered
             //    professional to use ASCII font names instead of
index f9434230e21a3de557a9edb2af4a752a0ab659dc..6617acace59b37e424af8dbbd3f2767b05d14126 100644 (file)
@@ -33,6 +33,9 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
             return false;
         }
         $uri_string = substr($uri_string, 4);
+        if (strlen($uri_string) == 0) {
+            return false;
+        }
         $new_length = strlen($uri_string) - 1;
         if ($uri_string[$new_length] != ')') {
             return false;
index 3d86efb44c33808da7caaae3fff5a9cfff08ab33..4ba45610feb9c3aaa172dccab1e7c3e4fa83786d 100644 (file)
@@ -72,18 +72,26 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
 
         // we purposely avoid using regex, hopefully this is faster
 
-        if (ctype_alpha($id)) {
-            $result = true;
-        } else {
-            if (!ctype_alpha(@$id[0])) {
+        if ($config->get('Attr.ID.HTML5') === true) {
+            if (preg_match('/[\t\n\x0b\x0c ]/', $id)) {
                 return false;
             }
-            // primitive style of regexps, I suppose
-            $trim = trim(
-                $id,
-                'A..Za..z0..9:-._'
-            );
-            $result = ($trim === '');
+        } else {
+            if (ctype_alpha($id)) {
+                // OK
+            } else {
+                if (!ctype_alpha(@$id[0])) {
+                    return false;
+                }
+                // primitive style of regexps, I suppose
+                $trim = trim(
+                    $id,
+                    'A..Za..z0..9:-._'
+                );
+                if ($trim !== '') {
+                    return false;
+                }
+            }
         }
 
         $regexp = $config->get('Attr.IDBlacklistRegexp');
@@ -91,14 +99,14 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
             return false;
         }
 
-        if (!$this->selector && $result) {
+        if (!$this->selector) {
             $id_accumulator->add($id);
         }
 
         // if no change was made to the ID, return the result
         // else, return the new id if stripping whitespace made it
         //     valid, or return false.
-        return $result ? $id : false;
+        return $id;
     }
 }
 
index e7df800b1eb8e78cc8241678d201e398b693c837..3b4d186743e22111c0ff609c29a29b941fb75b45 100644 (file)
@@ -76,24 +76,33 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
         // fairly well supported.
         $underscore = $config->get('Core.AllowHostnameUnderscore') ? '_' : '';
 
+        // Based off of RFC 1738, but amended so that
+        // as per RFC 3696, the top label need only not be all numeric.
         // The productions describing this are:
         $a   = '[a-z]';     // alpha
         $an  = '[a-z0-9]';  // alphanum
         $and = "[a-z0-9-$underscore]"; // alphanum | "-"
         // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
-        $domainlabel = "$an($and*$an)?";
-        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
-        $toplabel = "$a($and*$an)?";
+        $domainlabel = "$an(?:$and*$an)?";
+        // AMENDED as per RFC 3696
+        // toplabel    = alphanum | alphanum *( alphanum | "-" ) alphanum
+        //      side condition: not all numeric
+        $toplabel = "$an(?:$and*$an)?";
         // hostname    = *( domainlabel "." ) toplabel [ "." ]
-        if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
-            return $string;
+        if (preg_match("/^(?:$domainlabel\.)*($toplabel)\.?$/i", $string, $matches)) {
+            if (!ctype_digit($matches[1])) {
+                return $string;
+            }
         }
 
+        // PHP 5.3 and later support this functionality natively
+        if (function_exists('idn_to_ascii')) {
+            $string = idn_to_ascii($string);
+
         // If we have Net_IDNA2 support, we can support IRIs by
         // punycoding them. (This is the most portable thing to do,
         // since otherwise we have to assume browsers support
-
-        if ($config->get('Core.EnableIDNA')) {
+        } elseif ($config->get('Core.EnableIDNA')) {
             $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
             // we need to encode each period separately
             $parts = explode('.', $string);
@@ -114,13 +123,14 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
                     }
                 }
                 $string = implode('.', $new_parts);
-                if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
-                    return $string;
-                }
             } catch (Exception $e) {
                 // XXX error reporting
             }
         }
+        // Try again
+        if (preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string)) {
+            return $string;
+        }
         return false;
     }
 }
index 7df6cb3e1b4dc678ff736d7fac4dbf107fdac92e..235ebb34b60ff22b403b6d03d764650c6dbe65e1 100644 (file)
@@ -32,8 +32,7 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
             if ($src) {
                 $alt = $config->get('Attr.DefaultImageAlt');
                 if ($alt === null) {
-                    // truncate if the alt is too long
-                    $attr['alt'] = substr(basename($attr['src']), 0, 40);
+                    $attr['alt'] = basename($attr['src']);
                 } else {
                     $attr['alt'] = $alt;
                 }
diff --git a/extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoopener.php b/extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoopener.php
new file mode 100644 (file)
index 0000000..1db3c6c
--- /dev/null
@@ -0,0 +1,37 @@
+<?php
+
+// must be called POST validation
+
+/**
+ * Adds rel="noopener" to any attribute array that has a target entry,
+ * whatever the target's value.  This is used to prevent malicious websites
+ * from silently replacing the original window, which could be used
+ * to do phishing.
+ * This transform is controlled by %HTML.TargetNoopener.
+ */
+class HTMLPurifier_AttrTransform_TargetNoopener extends HTMLPurifier_AttrTransform
+{
+    /**
+     * @param array $attr Attribute name => value array; 'rel' and 'target' are read
+     * @param HTMLPurifier_Config $config Not consulted by this transform
+     * @param HTMLPurifier_Context $context Not consulted by this transform
+     * @return array $attr, with 'rel' rewritten when it existed or gained a token
+     */
+    public function transform($attr, $config, $context)
+    {
+        if (isset($attr['rel'])) {
+            $rels = explode(' ', $attr['rel']); // existing tokens, split on single spaces
+        } else {
+            $rels = array();
+        }
+        if (isset($attr['target']) && !in_array('noopener', $rels)) { // append only if not already listed
+            $rels[] = 'noopener';
+        }
+        if (!empty($rels) || isset($attr['rel'])) { // leave 'rel' unset when there is nothing to write
+            $attr['rel'] = implode(' ', $rels);
+        }
+
+        return $attr;
+    }
+}
+
diff --git a/extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoreferrer.php b/extlib/HTMLPurifier/HTMLPurifier/AttrTransform/TargetNoreferrer.php
new file mode 100644 (file)
index 0000000..587dc2e
--- /dev/null
@@ -0,0 +1,37 @@
+<?php
+
+// must be called POST validation
+
+/**
+ * Adds rel="noreferrer" to any attribute array that has a target entry,
+ * whatever the target's value.  Stated purpose: to prevent malicious
+ * websites from silently replacing the original window, which could be
+ * used to do phishing (NOTE(review): same wording as the noopener docs).
+ * This transform is controlled by %HTML.TargetNoreferrer.
+ */
+class HTMLPurifier_AttrTransform_TargetNoreferrer extends HTMLPurifier_AttrTransform
+{
+    /**
+     * @param array $attr Attribute name => value array; 'rel' and 'target' are read
+     * @param HTMLPurifier_Config $config Not consulted by this transform
+     * @param HTMLPurifier_Context $context Not consulted by this transform
+     * @return array $attr, with 'rel' rewritten when it existed or gained a token
+     */
+    public function transform($attr, $config, $context)
+    {
+        if (isset($attr['rel'])) {
+            $rels = explode(' ', $attr['rel']); // existing tokens, split on single spaces
+        } else {
+            $rels = array();
+        }
+        if (isset($attr['target']) && !in_array('noreferrer', $rels)) { // append only if not already listed
+            $rels[] = 'noreferrer';
+        }
+        if (!empty($rels) || isset($attr['rel'])) { // leave 'rel' unset when there is nothing to write
+            $attr['rel'] = implode(' ', $rels);
+        }
+
+        return $attr;
+    }
+}
+
index 07cc941758c78a2e3299baabb50fac5ba17a2181..47dfd1f66609f95e4638ed585a4dc3bda21a9e92 100644 (file)
@@ -225,6 +225,10 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
         );
         $max = $config->get('CSS.MaxImgLength');
 
+        $this->info['min-width'] =
+        $this->info['max-width'] =
+        $this->info['min-height'] =
+        $this->info['max-height'] =
         $this->info['width'] =
         $this->info['height'] =
             $max === null ?
@@ -370,6 +374,19 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
         );
         $this->info['page-break-inside'] = new HTMLPurifier_AttrDef_Enum(array('auto', 'avoid'));
 
+        $border_radius = new HTMLPurifier_AttrDef_CSS_Composite(
+            array(
+                new HTMLPurifier_AttrDef_CSS_Percentage(true), // disallow negative
+                new HTMLPurifier_AttrDef_CSS_Length('0') // disallow negative
+            ));
+
+        $this->info['border-top-left-radius'] =
+        $this->info['border-top-right-radius'] =
+        $this->info['border-bottom-right-radius'] =
+        $this->info['border-bottom-left-radius'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 2);
+        // TODO: support SLASH syntax
+        $this->info['border-radius'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 4);
+
     }
 
     /**
index 891b9f6f5bb0832d81392ac9d8283a76656a4c0e..4fc70e0efa763887596e3ef9af1bd998bc6104a4 100644 (file)
@@ -38,13 +38,19 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
             return false;
         }
 
+        // if li is not allowed, delete parent node
+        if (!isset($config->getHTMLDefinition()->info['li'])) {
+            trigger_error("Cannot allow ul/ol without allowing li", E_USER_WARNING);
+            return false;
+        }
+
         // the new set of children
         $result = array();
 
         // a little sanity check to make sure it's not ALL whitespace
         $all_whitespace = true;
 
-        $current_li = false;
+        $current_li = null;
 
         foreach ($children as $node) {
             if (!empty($node->is_whitespace)) {
@@ -65,7 +71,7 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
                 // to handle non-list elements; non-list elements should
                 // not be appended to an existing li; only li created
                 // for non-list. This distinction is not currently made.
-                if ($current_li === false) {
+                if ($current_li === null) {
                     $current_li = new HTMLPurifier_Node_Element('li');
                     $result[] = $current_li;
                 }
index 3e4a0f21824fadf752c06ec6a0fa90be393c90bb..cb6b3e6cdc25f4a15a9f6fe03f0477c61a8da2fd 100644 (file)
@@ -203,7 +203,7 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                     $current_tr_tbody->children[] = $node;
                     break;
                 case '#PCDATA':
-                    assert($node->is_whitespace);
+                    //assert($node->is_whitespace);
                     if ($current_tr_tbody === null) {
                         $ret[] = $node;
                     } else {
index 2b2db0c26477a6a26273ccd667f15cec7e27670f..3648364b30ff23c103d41bce1af20549ea06546a 100644 (file)
@@ -21,7 +21,7 @@ class HTMLPurifier_Config
      * HTML Purifier's version
      * @type string
      */
-    public $version = '4.7.0';
+    public $version = '4.9.3';
 
     /**
      * Whether or not to automatically finalize
@@ -333,7 +333,7 @@ class HTMLPurifier_Config
         }
 
         // Raw type might be negative when using the fully optimized form
-        // of stdclass, which indicates allow_null == true
+        // of stdClass, which indicates allow_null == true
         $rtype = is_int($def) ? $def : $def->type;
         if ($rtype < 0) {
             $type = -$rtype;
index bfbb0f92f5d5376d33c46712da67010daf0739e2..655c0e97ae657d57ae89fb15b90ae2af59326c54 100644 (file)
@@ -24,11 +24,11 @@ class HTMLPurifier_ConfigSchema
      *
      *  array(
      *      'Namespace' => array(
-     *          'Directive' => new stdclass(),
+     *          'Directive' => new stdClass(),
      *      )
      *  )
      *
-     * The stdclass may have the following properties:
+     * The stdClass may have the following properties:
      *
      *  - If isAlias isn't set:
      *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
@@ -39,8 +39,8 @@ class HTMLPurifier_ConfigSchema
      *      - namespace: Namespace this directive aliases to
      *      - name: Directive name this directive aliases to
      *
-     * In certain degenerate cases, stdclass will actually be an integer. In
-     * that case, the value is equivalent to an stdclass with the type
+     * In certain degenerate cases, stdClass will actually be an integer. In
+     * that case, the value is equivalent to an stdClass with the type
      * property set to the integer. If the integer is negative, type is
      * equal to the absolute value of integer, and allow_null is true.
      *
@@ -105,7 +105,7 @@ class HTMLPurifier_ConfigSchema
      */
     public function add($key, $default, $type, $allow_null)
     {
-        $obj = new stdclass();
+        $obj = new stdClass();
         $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
         if ($allow_null) {
             $obj->allow_null = true;
@@ -152,14 +152,14 @@ class HTMLPurifier_ConfigSchema
      */
     public function addAlias($key, $new_key)
     {
-        $obj = new stdclass;
+        $obj = new stdClass;
         $obj->key = $new_key;
         $obj->isAlias = true;
         $this->info[$key] = $obj;
     }
 
     /**
-     * Replaces any stdclass that only has the type property with type integer.
+     * Replaces any stdClass that only has the type property with type integer.
      */
     public function postProcess()
     {
index 1e6ccd22755dfa27722e17f457a00ea42bdc1d6f..371e948f1c76d99bacea65b4735454656858edbf 100644 (file)
Binary files a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema.ser and b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema.ser differ
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Attr.ID.HTML5.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Attr.ID.HTML5.txt
new file mode 100644 (file)
index 0000000..735d4b7
--- /dev/null
@@ -0,0 +1,10 @@
+Attr.ID.HTML5
+TYPE: bool/null
+DEFAULT: null
+VERSION: 4.8.0
+--DESCRIPTION--
+In HTML5, restrictions on the format of the id attribute have been significantly
+relaxed, such that any string is valid so long as it contains no spaces and
+is at least one character.  In lieu of a general HTML5 compatibility flag,
+set this configuration directive to true to use the relaxed rules.
+--# vim: et sw=4 sts=4
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/CSS.AllowDuplicates.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/CSS.AllowDuplicates.txt
new file mode 100644 (file)
index 0000000..4d054b1
--- /dev/null
@@ -0,0 +1,11 @@
+CSS.AllowDuplicates
+TYPE: bool
+DEFAULT: false
+VERSION: 4.8.0
+--DESCRIPTION--
+<p>
+  By default, HTML Purifier removes duplicate CSS properties,
+  like <code>color:red; color:blue</code>.  If this is set to
+  true, duplicate properties are allowed.
+</p>
+--# vim: et sw=4 sts=4
index b2b83d9ab6aba7d0f620ea235afa984baccc79ee..2e0cc81044f6d46cd16b7b0a35ee845f07717487 100644 (file)
@@ -1,5 +1,5 @@
 Cache.SerializerPermissions
-TYPE: int
+TYPE: int/null
 VERSION: 4.3.0
 DEFAULT: 0755
 --DESCRIPTION--
@@ -8,4 +8,9 @@ DEFAULT: 0755
     Directory permissions of the files and directories created inside
     the DefinitionCache/Serializer or other custom serializer path.
 </p>
+<p>
+    Starting with HTML Purifier 4.8.0, this also supports <code>NULL</code>,
+    which means that no chmod'ing or directory creation shall
+    occur.
+</p>
 --# vim: et sw=4 sts=4
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyRemoveScript.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyRemoveScript.txt
new file mode 100644 (file)
index 0000000..b2b6ab1
--- /dev/null
@@ -0,0 +1,16 @@
+Core.AggressivelyRemoveScript
+TYPE: bool
+VERSION: 4.9.0
+DEFAULT: true
+--DESCRIPTION--
+<p>
+    This directive enables aggressive pre-filter removal of
+    script tags.  This is not necessary for security,
+    but it can help work around a bug in libxml where embedded
+    HTML elements inside script sections cause the parser to
+    choke.  To revert to pre-4.9.0 behavior, set this to false.
+    This directive has no effect if %Core.Trusted is true,
+    %Core.RemoveScriptContents is false, or %Core.HiddenElements
+    does not contain script.
+</p>
+--# vim: et sw=4 sts=4
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
new file mode 100644 (file)
index 0000000..392b436
--- /dev/null
@@ -0,0 +1,36 @@
+Core.LegacyEntityDecoder
+TYPE: bool
+VERSION: 4.9.0
+DEFAULT: false
+--DESCRIPTION--
+<p>
+    Prior to HTML Purifier 4.9.0, entities were decoded by performing
+    a global search replace for all entities whose decoded versions
+    did not have special meanings under HTML, and replaced them with
+    their decoded versions.  We would match all entities, even if they did
+    not have a trailing semicolon, but only if there weren't any trailing
+    alphanumeric characters.
+</p>
+<table>
+<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
+<tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yena</td><td>&amp;yena</td><td>&amp;yena</td></tr>
+<tr><td>&amp;yen=</td><td>&yen;=</td><td>&yen;=</td></tr>
+</table>
+<p>
+    In HTML Purifier 4.9.0, we changed the behavior of entity parsing
+    to match entities that had missing trailing semicolons in fewer
+    cases, to more closely match HTML5 parsing behavior:
+</p>
+<table>
+<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
+<tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yena</td><td>&yen;a</td><td>&amp;yena</td></tr>
+<tr><td>&amp;yen=</td><td>&yen;=</td><td>&amp;yen=</td></tr>
+</table>
+<p>
+    This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
+</p>
+--# vim: et sw=4 sts=4
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoopener.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoopener.txt
new file mode 100644 (file)
index 0000000..dd514c0
--- /dev/null
@@ -0,0 +1,10 @@
+--# vim: et sw=4 sts=4
+HTML.TargetNoopener
+TYPE: bool
+VERSION: 4.8.0
+DEFAULT: TRUE
+--DESCRIPTION--
+If enabled, noopener rel attributes are added to links which have
+a target attribute associated with them.  This prevents malicious
+destinations from overwriting the original window.
+--# vim: et sw=4 sts=4
diff --git a/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoreferrer.txt b/extlib/HTMLPurifier/HTMLPurifier/ConfigSchema/schema/HTML.TargetNoreferrer.txt
new file mode 100644 (file)
index 0000000..cb5a0b0
--- /dev/null
@@ -0,0 +1,9 @@
+HTML.TargetNoreferrer
+TYPE: bool
+VERSION: 4.8.0
+DEFAULT: TRUE
+--DESCRIPTION--
+If enabled, noreferrer rel attributes are added to links which have
+a target attribute associated with them.  This prevents malicious
+destinations from overwriting the original window.
+--# vim: et sw=4 sts=4
index 666635a5ff06e0895c99a06ad0146f65cff19d0b..eb97307e200150b84c31df0a3ce7bf25d53315ad 100644 (file)
@@ -8,6 +8,7 @@ array (
   'ftp' => true,
   'nntp' => true,
   'news' => true,
+  'tel' => true,
 )
 --DESCRIPTION--
 Whitelist that defines the schemes that a URI is allowed to have.  This
index 728e378cbeb462998a10de23946b60e3ebd74cc5..834bc08c0ba626d66973e4d385cd1c401e38c453 100644 (file)
@@ -1,5 +1,5 @@
 URI.DefaultScheme
-TYPE: string
+TYPE: string/null
 DEFAULT: 'http'
 --DESCRIPTION--
 
@@ -7,4 +7,9 @@ DEFAULT: 'http'
     Defines through what scheme the output will be served, in order to
     select the proper object validator when no scheme information is present.
 </p>
+
+<p>
+    Starting with HTML Purifier 4.9.0, the default scheme can be null, in
+    which case we reject all URIs which do not have explicit schemes.
+</p>
 --# vim: et sw=4 sts=4
index 179f94eb03d66abe0da1b4bd8ade550e65ec9c05..58c81dcc441625b4ea966a7dd2b04131c0542e66 100644 (file)
@@ -9,75 +9,75 @@ DEFAULT: NULL
     absolute URIs into another URI, usually a URI redirection service.
     This directive accepts a URI, formatted with a <code>%s</code> where
     the url-encoded original URI should be inserted (sample:
-    <code>https://searx.laquadrature.net/?q=%s</code>).
-       </p>
-       <p>
+    <code>http://www.google.com/url?q=%s</code>).
+</p>
+<p>
     Uses for this directive:
-       </p>
-       <ul>
+</p>
+<ul>
     <li>
-    Prevent PageRank leaks, while being fairly transparent
-    to users (you may also want to add some client side JavaScript to
-    override the text in the statusbar). <strong>Notice</strong>:
-    Many security experts believe that this form of protection does not deter spam-bots.
+        Prevent PageRank leaks, while being fairly transparent
+        to users (you may also want to add some client side JavaScript to
+        override the text in the statusbar). <strong>Notice</strong>:
+        Many security experts believe that this form of protection does not deter spam-bots.
     </li>
     <li>
-    Redirect users to a splash page telling them they are leaving your
-    website. While this is poor usability practice, it is often mandated
-    in corporate environments.
+        Redirect users to a splash page telling them they are leaving your
+        website. While this is poor usability practice, it is often mandated
+        in corporate environments.
     </li>
-       </ul>
-       <p>
+</ul>
+<p>
     Prior to HTML Purifier 3.1.1, this directive also enabled the munging
     of browsable external resources, which could break things if your redirection
     script was a splash page or used <code>meta</code> tags. To revert to
     previous behavior, please use %URI.MungeResources.
-       </p>
-       <p>
+</p>
+<p>
     You may want to also use %URI.MungeSecretKey along with this directive
     in order to enforce what URIs your redirector script allows. Open
     redirector scripts can be a security risk and negatively affect the
     reputation of your domain name.
-       </p>
-       <p>
+</p>
+<p>
     Starting with HTML Purifier 3.1.1, there is also these substitutions:
-       </p>
-       <table>
+</p>
+<table>
     <thead>
-    <tr>
-    <th>Key</th>
-    <th>Description</th>
-    <th>Example <code>&lt;a href=""&gt;</code></th>
-    </tr>
+        <tr>
+            <th>Key</th>
+            <th>Description</th>
+            <th>Example <code>&lt;a href=""&gt;</code></th>
+        </tr>
     </thead>
     <tbody>
-    <tr>
-    <td>%r</td>
-    <td>1 - The URI embeds a resource<br />(blank) - The URI is merely a link</td>
-    <td></td>
-    </tr>
-    <tr>
-    <td>%n</td>
-    <td>The name of the tag this URI came from</td>
-    <td>a</td>
-    </tr>
-    <tr>
-    <td>%m</td>
-    <td>The name of the attribute this URI came from</td>
-    <td>href</td>
-    </tr>
-    <tr>
-    <td>%p</td>
-    <td>The name of the CSS property this URI came from, or blank if irrelevant</td>
-    <td></td>
-    </tr>
+        <tr>
+            <td>%r</td>
+            <td>1 - The URI embeds a resource<br />(blank) - The URI is merely a link</td>
+            <td></td>
+        </tr>
+        <tr>
+            <td>%n</td>
+            <td>The name of the tag this URI came from</td>
+            <td>a</td>
+        </tr>
+        <tr>
+            <td>%m</td>
+            <td>The name of the attribute this URI came from</td>
+            <td>href</td>
+        </tr>
+        <tr>
+            <td>%p</td>
+            <td>The name of the CSS property this URI came from, or blank if irrelevant</td>
+            <td></td>
+        </tr>
     </tbody>
-       </table>
-       <p>
+</table>
+<p>
     Admittedly, these letters are somewhat arbitrary; the only stipulation
     was that they couldn't be a through f. r is for resource (I would have preferred
     e, but you take what you can get), n is for name, m
     was picked because it came after n (and I couldn't use a), p is for
     property.
-       </p>
-       --# vim: et sw=4 sts=4
+</p>
+--# vim: et sw=4 sts=4
index 67bb5b1e69a5dcf3d38df50fede8c14a3594616e..9aa8ff354ff3984dabe64a68acd64fec3f7748e6 100644 (file)
@@ -118,7 +118,7 @@ abstract class HTMLPurifier_DefinitionCache
 
     /**
      * Clears all expired (older version or revision) objects from cache
-     * @note Be carefuly implementing this method as flush. Flush must
+     * @note Be careful implementing this method as flush. Flush must
      *       not interfere with other Definition types, and cleanup()
      *       should not be repeatedly called by userland code.
      * @param HTMLPurifier_Config $config
index ce268d91b429aef4de846b465b3c09631357af6c..952e48d4704c24ff950b05d137451b3500a838ac 100644 (file)
@@ -97,6 +97,12 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
         }
         $dir = $this->generateDirectoryPath($config);
         $dh = opendir($dir);
+        // Apparently, on some versions of PHP, readdir will return
+        // an empty string if you pass an invalid argument to readdir.
+        // So you need this test.  See #49.
+        if (false === $dh) {
+            return false;
+        }
         while (false !== ($filename = readdir($dh))) {
             if (empty($filename)) {
                 continue;
@@ -106,6 +112,8 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
             }
             unlink($dir . '/' . $filename);
         }
+        closedir($dh);
+        return true;
     }
 
     /**
@@ -119,6 +127,10 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
         }
         $dir = $this->generateDirectoryPath($config);
         $dh = opendir($dir);
+        // See #49 (and above).
+        if (false === $dh) {
+            return false;
+        }
         while (false !== ($filename = readdir($dh))) {
             if (empty($filename)) {
                 continue;
@@ -131,6 +143,8 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
                 unlink($dir . '/' . $filename);
             }
         }
+        closedir($dh);
+        return true;
     }
 
     /**
@@ -186,11 +200,9 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
         if ($result !== false) {
             // set permissions of the new file (no execute)
             $chmod = $config->get('Cache.SerializerPermissions');
-            if (!$chmod) {
-                $chmod = 0644; // invalid config or simpletest
+            if ($chmod !== null) {
+                chmod($file, $chmod & 0666);
             }
-            $chmod = $chmod & 0666;
-            chmod($file, $chmod);
         }
         return $result;
     }
@@ -204,8 +216,10 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
     {
         $directory = $this->generateDirectoryPath($config);
         $chmod = $config->get('Cache.SerializerPermissions');
-        if (!$chmod) {
-            $chmod = 0755; // invalid config or simpletest
+        if ($chmod === null) {
+            // TODO: This races
+            if (is_dir($directory)) return true;
+            return mkdir($directory);
         }
         if (!is_dir($directory)) {
             $base = $this->generateBaseDirectoryPath($config);
@@ -219,15 +233,16 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
             } elseif (!$this->_testPermissions($base, $chmod)) {
                 return false;
             }
-            mkdir($directory, $chmod);
-            if (!$this->_testPermissions($directory, $chmod)) {
+            if (!mkdir($directory, $chmod)) {
                 trigger_error(
-                    'Base directory ' . $base . ' does not exist,
-                    please create or change using %Cache.SerializerPath',
+                    'Could not create directory ' . $directory . '',
                     E_USER_WARNING
                 );
                 return false;
             }
+            if (!$this->_testPermissions($directory, $chmod)) {
+                return false;
+            }
         } elseif (!$this->_testPermissions($directory, $chmod)) {
             return false;
         }
@@ -256,7 +271,7 @@ class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCac
             );
             return false;
         }
-        if (function_exists('posix_getuid')) {
+        if (function_exists('posix_getuid') && $chmod !== null) {
             // POSIX system, we can give more specific advice
             if (fileowner($dir) === posix_getuid()) {
                 // we can chmod it ourselves
index fef9b589063a9c38e17a4d26d299fd5a7b40c010..b94f1754234dab034fc6fe19b32c00e13a8c79dc 100644 (file)
@@ -101,6 +101,14 @@ class HTMLPurifier_Encoder
      * It will parse according to UTF-8 and return a valid UTF8 string, with
      * non-SGML codepoints excluded.
      *
+     * Specifically, it will permit:
+     * \x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}
+     * Source: https://www.w3.org/TR/REC-xml/#NT-Char
+     * Arguably this function should be modernized to the HTML5 set
+     * of allowed characters:
+     * https://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+     * which simultaneously expands and restricts the set of allowed characters.
+     *
      * @param string $str The string to clean
      * @param bool $force_php
      * @return string
@@ -122,15 +130,12 @@ class HTMLPurifier_Encoder
      *       function that needs to be able to understand UTF-8 characters.
      *       As of right now, only smart lossless character encoding converters
      *       would need that, and I'm probably not going to implement them.
-     *       Once again, PHP 6 should solve all our problems.
      */
     public static function cleanUTF8($str, $force_php = false)
     {
         // UTF-8 validity is checked since PHP 4.3.5
         // This is an optimization: if the string is already valid UTF-8, no
         // need to do PHP stuff. 99% of the time, this will be the case.
-        // The regexp matches the XML char production, as well as well as excluding
-        // non-SGML codepoints U+007F to U+009F
         if (preg_match(
             '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
             $str
@@ -255,6 +260,7 @@ class HTMLPurifier_Encoder
                                 // 7F-9F is not strictly prohibited by XML,
                                 // but it is non-SGML, and thus we don't allow it
                                 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
+                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                             )
                         ) {
index 61529dcd9d9669103502e4e3527ab57f74a1213b..c372b5a6a6c0f9641cf94bb3feb97996898aca2c 100644 (file)
@@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
      */
     protected $_entity_lookup;
 
+    /**
+     * Callback regex string for entities in text.
+     * @type string
+     */
+    protected $_textEntitiesRegex;
+
+    /**
+     * Callback regex string for entities in attributes.
+     * @type string
+     */
+    protected $_attrEntitiesRegex;
+
+    /**
+     * Regex string matching a semicolon-optional entity name at the start of an entity.
+     */
+    protected $_semiOptionalPrefixRegex;
+
+    public function __construct() {
+        // From
+        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
+        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
+
+        // NB: three empty captures to put the fourth match in the right
+        // place
+        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
+
+        $this->_textEntitiesRegex =
+            '/&(?:'.
+            // hex
+            '[#]x([a-fA-F0-9]+);?|'.
+            // dec
+            '[#]0*(\d+);?|'.
+            // string (mandatory semicolon)
+            // NB: order matters: match semicolon preferentially
+            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+            // string (optional semicolon)
+            "($semi_optional)".
+            ')/';
+
+        $this->_attrEntitiesRegex =
+            '/&(?:'.
+            // hex
+            '[#]x([a-fA-F0-9]+);?|'.
+            // dec
+            '[#]0*(\d+);?|'.
+            // string (mandatory semicolon)
+            // NB: order matters: match semicolon preferentially
+            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+            // string (optional semicolon)
+            // don't match if trailing is equals or alphanumeric (URL
+            // like)
+            "($semi_optional)(?![=;A-Za-z0-9])".
+            ')/';
+
+    }
+
+    /**
+     * Substitute entities with the parsed equivalents.  Use this on
+     * textual data in an HTML document (as opposed to attributes.)
+     *
+     * @param string $string String to have entities parsed.
+     * @return string Parsed string.
+     */
+    public function substituteTextEntities($string)
+    {
+        return preg_replace_callback(
+            $this->_textEntitiesRegex,
+            array($this, 'entityCallback'),
+            $string
+        );
+    }
+
+    /**
+     * Substitute entities with the parsed equivalents.  Use this on
+     * attribute contents in documents.
+     *
+     * @param string $string String to have entities parsed.
+     * @return string Parsed string.
+     */
+    public function substituteAttrEntities($string)
+    {
+        return preg_replace_callback(
+            $this->_attrEntitiesRegex,
+            array($this, 'entityCallback'),
+            $string
+        );
+    }
+
+    /**
+     * Callback function for substituteTextEntities() and substituteAttrEntities().
+     *
+     * @param array $matches  PCRE matches array, with 0 the entire match, and
+     *                  one of index 1 (hex value), 2 (dec value), 3 (name with
+     *                  semicolon) or 4 (semicolon-optional name) set.
+     * @return string Replacement string.
+     */
+
+    protected function entityCallback($matches)
+    {
+        $entity = $matches[0];
+        $hex_part = @$matches[1];
+        $dec_part = @$matches[2];
+        $named_part = empty($matches[3]) ? @$matches[4] : $matches[3];
+        if ($hex_part !== NULL && $hex_part !== "") {
+            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
+        } elseif ($dec_part !== NULL && $dec_part !== "") {
+            return HTMLPurifier_Encoder::unichr((int) $dec_part);
+        } else {
+            if (!$this->_entity_lookup) {
+                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+            }
+            if (isset($this->_entity_lookup->table[$named_part])) {
+                return $this->_entity_lookup->table[$named_part];
+            } else {
+                // exact match didn't match anything, so test if
+                // any of the semicolon-optional entities match the prefix.
+                // Testing that this was an EXACT match is important to
+                // prevent an infinite loop
+                if (!empty($matches[3])) {
+                    return preg_replace_callback(
+                        $this->_semiOptionalPrefixRegex,
+                        array($this, 'entityCallback'),
+                        $entity
+                    );
+                }
+                return $entity;
+            }
+        }
+    }
+
+    // LEGACY CODE BELOW
+
     /**
      * Callback regex string for parsing entities.
      * @type string
@@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
                 $entity;
         } else {
             return isset($this->_special_ent2dec[$matches[3]]) ?
-                $this->_special_ent2dec[$matches[3]] :
+                $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
                 $entity;
         }
     }
index 08e62c16bf79b2483ee89ae317e1bdaa9f874fc2..66f70b0fc00fdbfd54b88dcdb88f035dc2d34ba1 100644 (file)
@@ -95,7 +95,10 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
         if ($tidy !== null) {
             $this->_tidy = $tidy;
         }
-        $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);
+        // NB: this must be NON-greedy because if we have
+        // <style>foo</style>  <style>bar</style>
+        // we must not grab foo</style>  <style>bar
+        $html = preg_replace_callback('#<style(?:\s.*)?>(.*)<\/style>#isU', array($this, 'styleCallback'), $html);
         $style_blocks = $this->_styleMatches;
         $this->_styleMatches = array(); // reset
         $context->register('StyleBlocks', $style_blocks); // $context must not be reused
index 6fb568714659bc7a8e480f4db5857817b7962a74..eb56e2dfa2e349f2dc5d37ab41153f546ff01f6c 100644 (file)
@@ -146,7 +146,7 @@ class HTMLPurifier_Generator
             $attr = $this->generateAttributes($token->attr, $token->name);
             if ($this->_flashCompat) {
                 if ($token->name == "object") {
-                    $flash = new stdclass();
+                    $flash = new stdClass();
                     $flash->attr = $token->attr;
                     $flash->param = array();
                     $this->_flashStack[] = $flash;
diff --git a/extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoopener.php b/extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoopener.php
new file mode 100644 (file)
index 0000000..b967ff5
--- /dev/null
@@ -0,0 +1,21 @@
+<?php
+
+/**
+ * Module adds the target-based noopener attribute transformation to a tags.  It
+ * is enabled by HTML.TargetNoopener
+ */
+class HTMLPurifier_HTMLModule_TargetNoopener extends HTMLPurifier_HTMLModule
+{
+    /**
+     * @type string
+     */
+    public $name = 'TargetNoopener';
+
+    /**
+     * @param HTMLPurifier_Config $config
+     */
+    public function setup($config) {
+        $a = $this->addBlankElement('a');
+        $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetNoopener();
+    }
+}
diff --git a/extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoreferrer.php b/extlib/HTMLPurifier/HTMLPurifier/HTMLModule/TargetNoreferrer.php
new file mode 100644 (file)
index 0000000..32484d6
--- /dev/null
@@ -0,0 +1,21 @@
+<?php
+
+/**
+ * Module adds the target-based noreferrer attribute transformation to a tags.  It
+ * is enabled by HTML.TargetNoreferrer
+ */
+class HTMLPurifier_HTMLModule_TargetNoreferrer extends HTMLPurifier_HTMLModule
+{
+    /**
+     * @type string
+     */
+    public $name = 'TargetNoreferrer';
+
+    /**
+     * @param HTMLPurifier_Config $config
+     */
+    public function setup($config) {
+        $a = $this->addBlankElement('a');
+        $a->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetNoreferrer();
+    }
+}
index f3a17cb03b3f90a8c35d74f676202d7d9822c9df..38c058fe235e6597a20fda49a7c800065b5954b5 100644 (file)
@@ -271,6 +271,14 @@ class HTMLPurifier_HTMLModuleManager
         if ($config->get('HTML.TargetBlank')) {
             $modules[] = 'TargetBlank';
         }
+        // NB: HTML.TargetNoreferrer and HTML.TargetNoopener must be AFTER HTML.TargetBlank
+        // so that their post-attr-transforms get run afterwards.
+        if ($config->get('HTML.TargetNoreferrer')) {
+            $modules[] = 'TargetNoreferrer';
+        }
+        if ($config->get('HTML.TargetNoopener')) {
+            $modules[] = 'TargetNoopener';
+        }
 
         // merge in custom modules
         $modules = array_merge($modules, $this->userModules);
index 8bc4f4b89629d6aa9983271111d1f5b84a260ed4..74f83eaa7d16ad8a5e4455243756e46559b4082a 100644 (file)
@@ -27,13 +27,18 @@ class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
         if (strpos($token->data, '://') === false) {
             // our really quick heuristic failed, abort
             // this may not work so well if we want to match things like
-            // "domainname.com", but then again, most people don't
+            // "google.com", but then again, most people don't
             return;
         }
 
-        // there is/are URL(s). Let's split the string:
-        // Note: this regex is extremely permissive
-        $bits = preg_split('#((?:https?|ftp)://[^\s\'",<>()]+)#Su', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
+        // there is/are URL(s). Let's split the string.
+        // We use this regex:
+        // https://gist.github.com/gruber/249502
+        // but with @cscott's backtracking fix and also
+        // the Unicode characters un-Unicodified.
+        $bits = preg_split(
+            '/\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\x{00ab}\x{00bb}\x{201c}\x{201d}\x{2018}\x{2019}]))/iu',
+            $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);
 
 
         $token = array();
index 01353ff1d5037950f8dc6e162696de4709853745..0ebc477c68e47c4b28d0d213985b06fc4550c96b 100644 (file)
@@ -46,6 +46,12 @@ class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
         $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
         $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');
         $this->exclude = $config->get('AutoFormat.RemoveEmpty.Predicate');
+        foreach ($this->exclude as $key => $attrs) {
+            if (!is_array($attrs)) {
+                // HACK, see HTMLPurifier/Printer/ConfigForm.php
+                $this->exclude[$key] = explode(';', $attrs);
+            }
+        }
         $this->attrValidator = new HTMLPurifier_AttrValidator();
     }
 
index 3d17e07af21ee5231afea8a2fd8410dfa9a11226..317f7864dd4b70b925ac4ae36300e6a44d32a8de 100644 (file)
@@ -36,6 +36,7 @@ class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
     );
 
     /**
+     * These are all lower-case keys.
      * @type array
      */
     protected $allowedParam = array(
@@ -43,7 +44,7 @@ class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
         'movie' => true,
         'flashvars' => true,
         'src' => true,
-        'allowFullScreen' => true, // if omitted, assume to be 'false'
+        'allowfullscreen' => true, // if omitted, assume to be 'false'
     );
 
     /**
@@ -93,9 +94,11 @@ class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
                     $token->attr['name'] === $this->addParam[$n]) {
                     // keep token, and add to param stack
                     $this->paramStack[$i][$n] = true;
-                } elseif (isset($this->allowedParam[$n])) {
+                } elseif (isset($this->allowedParam[strtolower($n)])) {
                     // keep token, don't do anything to it
                     // (could possibly check for duplicates here)
+                    // Note: In principle, parameters should be case sensitive.
+                    // But it seems they are not really; so accept any case.
                 } else {
                     $token = false;
                 }
index 43732621dc9737d10c66357e2c40c95f55883f8a..e9da3ed5e94400793bedd4bd437cc01620081912 100644 (file)
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer
                         break;
                     }
 
-                    if (class_exists('DOMDocument') &&
+                    if (class_exists('DOMDocument', false) &&
                         method_exists('DOMDocument', 'loadHTML') &&
                         !extension_loaded('domxml')
                     ) {
@@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
             '&#x27;' => "'"
         );
 
+    public function parseText($string, $config) {
+        return $this->parseData($string, false, $config);
+    }
+
+    public function parseAttr($string, $config) {
+        return $this->parseData($string, true, $config);
+    }
+
     /**
      * Parses special entities into the proper characters.
      *
      * This string will translate escaped versions of the special characters
      * into the correct ones.
      *
-     * @warning
-     * You should be able to treat the output of this function as
-     * completely parsed, but that's only because all other entities should
-     * have been handled previously in substituteNonSpecialEntities()
-     *
      * @param string $string String character data to be parsed.
      * @return string Parsed character data.
      */
-    public function parseData($string)
+    public function parseData($string, $is_attr, $config)
     {
         // following functions require at least one character
         if ($string === '') {
@@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
         }
 
         // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        if ($config->get('Core.LegacyEntityDecoder')) {
+            $string = $this->_entity_parser->substituteSpecialEntities($string);
+        } else {
+            if ($is_attr) {
+                $string = $this->_entity_parser->substituteAttrEntities($string);
+            } else {
+                $string = $this->_entity_parser->substituteTextEntities($string);
+            }
+        }
         return $string;
     }
 
@@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
         }
 
         // expand entities that aren't the big five
-        $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        if ($config->get('Core.LegacyEntityDecoder')) {
+            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        }
 
         // clean into wellformed UTF-8 string for an SGML context: this has
         // to be done after entity expansion because the entities sometimes
@@ -335,6 +348,13 @@ class HTMLPurifier_Lexer
             $html = preg_replace('#<\?.+?\?>#s', '', $html);
         }
 
+        $hidden_elements = $config->get('Core.HiddenElements');
+        if ($config->get('Core.AggressivelyRemoveScript') &&
+            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
+            || empty($hidden_elements["script"]))) {
+            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
+        }
+
         return $html;
     }
 
@@ -345,12 +365,17 @@ class HTMLPurifier_Lexer
     public function extractBody($html)
     {
         $matches = array();
-        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
+        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
         if ($result) {
-            return $matches[1];
-        } else {
-            return $html;
+            // Make sure it's not in a comment
+            $comment_start = strrpos($matches[1], '<!--');
+            $comment_end   = strrpos($matches[1], '-->');
+            if ($comment_start === false ||
+                ($comment_end !== false && $comment_end > $comment_start)) {
+                return $matches[2];
+            }
         }
+        return $html;
     }
 }
 
index b81819290976e2d19c7e3cc93011abb305ec5c47..22ab5820c5f426ca9f5433c75103e88a7f307ec4 100644 (file)
@@ -72,12 +72,20 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         $doc->loadHTML($html);
         restore_error_handler();
 
+        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
+                      getElementsByTagName('body')->item(0);  // <body>
+
+        $div = $body->getElementsByTagName('div')->item(0); // <div>
         $tokens = array();
-        $this->tokenizeDOM(
-            $doc->getElementsByTagName('html')->item(0)-> // <html>
-            getElementsByTagName('body')->item(0), //   <body>
-            $tokens
-        );
+        $this->tokenizeDOM($div, $tokens, $config);
+        // If the div has a sibling, that means we tripped across
+        // a premature </div> tag.  So remove the div we parsed,
+        // and then tokenize the rest of body.  We can't tokenize
+        // the sibling directly as we'll lose the tags in that case.
+        if ($div->nextSibling) {
+            $body->removeChild($div);
+            $this->tokenizeDOM($body, $tokens, $config);
+        }
         return $tokens;
     }
 
@@ -88,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
      * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
      * @return HTMLPurifier_Token of node appended to previously passed tokens.
      */
-    protected function tokenizeDOM($node, &$tokens)
+    protected function tokenizeDOM($node, &$tokens, $config)
     {
         $level = 0;
         $nodes = array($level => new HTMLPurifier_Queue(array($node)));
@@ -97,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
             while (!$nodes[$level]->isEmpty()) {
                 $node = $nodes[$level]->shift(); // FIFO
                 $collect = $level > 0 ? true : false;
-                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                 if ($needEndingTag) {
                     $closingNodes[$level][] = $node;
                 }
@@ -127,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
      * @return bool if the token needs an endtoken
      * @todo data and tagName properties don't seem to exist in DOMNode?
      */
-    protected function createStartNode($node, &$tokens, $collect)
+    protected function createStartNode($node, &$tokens, $collect, $config)
     {
         // intercept non element nodes. WE MUST catch all of them,
         // but we're not getting the character reference nodes because
@@ -151,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                     }
                 }
             }
-            $tokens[] = $this->factory->createText($this->parseData($data));
+            $tokens[] = $this->factory->createText($this->parseText($data, $config));
             return false;
         } elseif ($node->nodeType === XML_COMMENT_NODE) {
             // this is code is only invoked for comments in script/style in versions
@@ -252,7 +260,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
      * @param HTMLPurifier_Context $context
      * @return string
      */
-    protected function wrapHTML($html, $config, $context)
+    protected function wrapHTML($html, $config, $context, $use_div = true)
     {
         $def = $config->getDefinition('HTML');
         $ret = '';
@@ -271,7 +279,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         $ret .= '<html><head>';
         $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
         // No protection if $html contains a stray </div>!
-        $ret .= '</head><body>' . $html . '</body></html>';
+        $ret .= '</head><body>';
+        if ($use_div) $ret .= '<div>';
+        $ret .= $html;
+        if ($use_div) $ret .= '</div>';
+        $ret .= '</body></html>';
         return $ret;
     }
 }
index 746b6e315f8ea0a7d01dcabf05a330bb97387a3d..6f1308966b2bf91eb271f1c5060f8790d0b83dad 100644 (file)
@@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 // We are not inside tag and there still is another tag to parse
                 $token = new
                 HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                         substr(
                             $html,
                             $cursor,
                             $position_next_lt - $cursor
-                        )
+                        ), $config
                     )
                 );
                 if ($maintain_line_numbers) {
@@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 // Create Text of rest of string
                 $token = new
                 HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                         substr(
                             $html,
                             $cursor
-                        )
+                        ), $config
                     )
                 );
                 if ($maintain_line_numbers) {
@@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 $token = new
                 HTMLPurifier_Token_Text(
                     '<' .
-                    $this->parseData(
-                        substr($html, $cursor)
+                    $this->parseText(
+                        substr($html, $cursor), $config
                     )
                 );
                 if ($maintain_line_numbers) {
@@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             if ($value === false) {
                 $value = '';
             }
-            return array($key => $this->parseData($value));
+            return array($key => $this->parseAttr($value, $config));
         }
 
         // setup loop environment
@@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 if ($value === false) {
                     $value = '';
                 }
-                $array[$key] = $this->parseData($value);
+                $array[$key] = $this->parseAttr($value, $config);
                 $cursor++;
             } else {
                 // boolattr
index ff4fa218fba247a4329324ee7cc920f5f45437bd..0b452d17fcd6c5b95580159cfc4526e4fac66ba8 100644 (file)
@@ -21,7 +21,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
     public function tokenizeHTML($html, $config, $context)
     {
         $new_html = $this->normalize($html, $config, $context);
-        $new_html = $this->wrapHTML($new_html, $config, $context);
+        $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
         try {
             $parser = new HTML5($new_html);
             $doc = $parser->save();
@@ -34,9 +34,9 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
         $tokens = array();
         $this->tokenizeDOM(
             $doc->getElementsByTagName('html')->item(0)-> // <html>
-                getElementsByTagName('body')->item(0) //   <body>
+                  getElementsByTagName('body')->item(0) //   <body>
             ,
-            $tokens
+            $tokens, $config
         );
         return $tokens;
     }
@@ -1515,6 +1515,7 @@ class HTML5
                 // Consume the maximum number of characters possible, with the
                 // consumed characters case-sensitively matching one of the
                 // identifiers in the first column of the entities table.
+
                 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                 $len = strlen($e_name);
 
@@ -1547,7 +1548,7 @@ class HTML5
 
         // Return a character token for the character corresponding to the
         // entity name (as given by the second column of the entities table).
-        return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
+        return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
     }
 
     private function emitToken($token)
index 36100ce7384b6eb394f382e87878eaa0e9a7645e..65a777904136c18049ed6597b19275af44f2c5ee 100644 (file)
@@ -327,6 +327,10 @@ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer
                 case HTMLPurifier_VarParser::HASH:
                     $nvalue = '';
                     foreach ($value as $i => $v) {
+                        if (is_array($v)) {
+                            // HACK
+                            $v = implode(";", $v);
+                        }
                         $nvalue .= "$i:$v" . PHP_EOL;
                     }
                     $value = $nvalue;
index e389e001162d7d83091c05d7552abfd10c016f75..a6eb09e453eb6adc96a17188c1bf8a9111b4824a 100644 (file)
@@ -165,7 +165,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
                         if (empty($zipper->front)) break;
                         $token = $zipper->prev($token);
                         // indicate that other injectors should not process this token,
-                        // but we need to reprocess it
+                        // but we need to reprocess it.  See Note [Injector skips]
                         unset($token->skip[$i]);
                         $token->rewind = $i;
                         if ($token instanceof HTMLPurifier_Token_Start) {
@@ -210,6 +210,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
                 if ($token instanceof HTMLPurifier_Token_Text) {
                     foreach ($this->injectors as $i => $injector) {
                         if (isset($token->skip[$i])) {
+                            // See Note [Injector skips]
                             continue;
                         }
                         if ($token->rewind !== null && $token->rewind !== $i) {
@@ -367,6 +368,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
             if ($ok) {
                 foreach ($this->injectors as $i => $injector) {
                     if (isset($token->skip[$i])) {
+                        // See Note [Injector skips]
                         continue;
                     }
                     if ($token->rewind !== null && $token->rewind !== $i) {
@@ -422,6 +424,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
                 $token->start = $current_parent;
                 foreach ($this->injectors as $i => $injector) {
                     if (isset($token->skip[$i])) {
+                        // See Note [Injector skips]
                         continue;
                     }
                     if ($token->rewind !== null && $token->rewind !== $i) {
@@ -534,12 +537,17 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
      */
     protected function processToken($token, $injector = -1)
     {
+        // Zend OpCache miscompiles $token = array($token), so
+        // avoid this pattern.  See: https://github.com/ezyang/htmlpurifier/issues/108
+
         // normalize forms of token
         if (is_object($token)) {
-            $token = array(1, $token);
+            $tmp = $token;
+            $token = array(1, $tmp);
         }
         if (is_int($token)) {
-            $token = array($token);
+            $tmp = $token;
+            $token = array($tmp);
         }
         if ($token === false) {
             $token = array(1);
@@ -561,7 +569,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
         list($old, $r) = $this->zipper->splice($this->token, $delete, $token);
 
         if ($injector > -1) {
-            // determine appropriate skips
+            // See Note [Injector skips]
+            // Determine appropriate skips.  Here's what the code does:
+            //  *If* we deleted one or more tokens, copy the skips
+            //  of those tokens into the skips of the new tokens (in $token).
+            //  Also, mark the newly inserted tokens as having come from
+            //  $injector.
             $oldskip = isset($old[0]) ? $old[0]->skip : array();
             foreach ($token as $object) {
                 $object->skip = $oldskip;
@@ -597,4 +610,50 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
     }
 }
 
+// Note [Injector skips]
+// ~~~~~~~~~~~~~~~~~~~~~
+// When I originally designed this class, the idea behind the 'skip'
+// property of HTMLPurifier_Token was to help avoid infinite loops
+// in injector processing.  For example, suppose you wrote an injector
+// that bolded swear words.  Naively, you might write it so that
+// whenever you saw ****, you replaced it with <strong>****</strong>.
+//
+// When this happens, we will reprocess all of the tokens with the
+// other injectors.  Now there is an opportunity for infinite loop:
+// if we rerun the swear-word injector on these tokens, we might
+// see **** and then reprocess again to get
+// <strong><strong>****</strong></strong> ad infinitum.
+//
+// Thus, the idea of a skip is that once we process a token with
+// an injector, we mark all of those tokens as having "come from"
+// the injector, and we never run the injector again on these
+// tokens.
+//
+// There were two more complications, however:
+//
+//  - With HTMLPurifier_Injector_RemoveEmpty, we noticed that if
+//    you had <b><i></i></b>, after you removed the <i></i>, you
+//    really would like this injector to go back and reprocess
+//    the <b> tag, discovering that it is now empty and can be
+//    removed.  So we reintroduced the possibility of infinite looping
+//    by adding a "rewind" function, which let you go back to an
+//    earlier point in the token stream and reprocess it with injectors.
+//    Needless to say, we need to UN-skip the token so it gets
+//    reprocessed.
+//
+//  - Suppose that you successfully process a token, replace it with
+//    one with your skip mark, but now another injector wants to
+//    process the skipped token with another token.  Should you continue
+//    to skip that new token, or reprocess it?  If you reprocess,
+//    you can end up with an infinite loop where one injector converts
+//    <a> to <b>, and then another injector converts it back.  So
+//    we inherit the skips, but for some reason, I thought that we
+//    should inherit the skip from the first of the tokens
+//    that we deleted.  Why?  Well, it seems to work OK.
+//
+// If I were to redesign this functionality, I would absolutely not
+// go about doing it this way: the semantics are just not very well
+// defined, and in any case you probably wanted to operate on trees,
+// not token streams.
+
 // vim: et sw=4 sts=4
index 85b85e072d0e43ee779ca6edb43f8e0a07e1aa2f..84d3619a3670ab349efa86c47ae586d103454d93 100644 (file)
@@ -26,7 +26,7 @@ abstract class HTMLPurifier_Token
     public $armor = array();
 
     /**
-     * Used during MakeWellFormed.
+     * Used during MakeWellFormed.  See Note [Injector skips]
      * @type
      */
     public $skip;
index a5e7ae29841d9ab35eda53f1c43815080b87857c..9c5be39d188a42a46cd9db74cd9db21e5cf4d6a7 100644 (file)
@@ -85,11 +85,13 @@ class HTMLPurifier_URI
             $def = $config->getDefinition('URI');
             $scheme_obj = $def->getDefaultScheme($config, $context);
             if (!$scheme_obj) {
-                // something funky happened to the default scheme object
-                trigger_error(
-                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
-                    E_USER_WARNING
-                );
+                if ($def->defaultScheme !== null) {
+                    // something funky happened to the default scheme object
+                    trigger_error(
+                        'Default scheme object "' . $def->defaultScheme . '" was not readable',
+                        E_USER_WARNING
+                    );
+                } // suppress error if it's null
                 return false;
             }
         }
index 6ebca4984810e2d0a43a18a02405aa9d23667fb7..41c49d5533f173142bb170ce9d09f372838bb105 100644 (file)
@@ -79,9 +79,18 @@ class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme
         } else {
             $raw_data = $data;
         }
+        if ( strlen($raw_data) < 12 ) {
+            // error; exif_imagetype throws exception with small files,
+            // and this likely indicates a corrupt URI/failed parse anyway
+            return false;
+        }
         // XXX probably want to refactor this into a general mechanism
         // for filtering arbitrary content types
-        $file = tempnam("/tmp", "");
+        if (function_exists('sys_get_temp_dir')) {
+            $file = tempnam(sys_get_temp_dir(), "");
+        } else {
+            $file = tempnam("/tmp", "");
+        }
         file_put_contents($file, $raw_data);
         if (function_exists('exif_imagetype')) {
             $image_code = exif_imagetype($file);
diff --git a/extlib/HTMLPurifier/HTMLPurifier/URIScheme/tel.php b/extlib/HTMLPurifier/HTMLPurifier/URIScheme/tel.php
new file mode 100644 (file)
index 0000000..8cd1933
--- /dev/null
@@ -0,0 +1,46 @@
+<?php
+
+/**
+ * Validates tel (for phone numbers).
+ *
+ * The relevant specifications for this protocol are RFC 3966 and RFC 5341,
+ * but this class takes a much simpler approach: we normalize phone
+ * numbers so that they only include (possibly) a leading plus,
+ * and then any number of digits and x'es.
+ */
+
+class HTMLPurifier_URIScheme_tel extends HTMLPurifier_URIScheme
+{
+    /**
+     * @type bool
+     */
+    public $browsable = false;
+
+    /**
+     * @type bool
+     */
+    public $may_omit_host = true;
+
+    /**
+     * @param HTMLPurifier_URI $uri
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return bool
+     */
+    public function doValidate(&$uri, $config, $context)
+    {
+        $uri->userinfo = null;
+        $uri->host     = null;
+        $uri->port     = null;
+
+        // Delete every character that is neither a digit nor an 'x'
+        // from the phone number, EXCEPT for a leading plus sign.
+        $uri->path = preg_replace('/(?!^\+)[^\dx]/', '',
+                     // Normalize e(x)tension to lower-case
+                     str_replace('X', 'x', $uri->path));
+
+        return true;
+    }
+}
+
+// vim: et sw=4 sts=4
index 1163055e28e8e7f468a15d63124bb70ed6cdeb33..e94f14fa9ed3d03bd3a9c0a03a4cfae929772163 100644 (file)
@@ -1 +1 @@
-4.7.0
\ No newline at end of file
+4.9.3
\ No newline at end of file