library/HTML5/Data.php

   1 <?php
   2
   3 // warning: this file is encoded in UTF-8!
   4
   5 class HTML5_Data
   6 {
   7
   8     // at some point this should be moved to a .ser file. Another
   9     // possible optimization is to give UTF-8 bytes, not Unicode
  10     // codepoints
  11     protected static $realCodepointTable = array(
  12         0x0D => 0x000A, // LINE FEED (LF)
  13         0x80 => 0x20AC, // EURO SIGN ('€')
  14         0x81 => 0xFFFD, // REPLACEMENT CHARACTER
  15         0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
  16         0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
  17         0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
  18         0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
  19         0x86 => 0x2020, // DAGGER ('†')
  20         0x87 => 0x2021, // DOUBLE DAGGER ('‡')
  21         0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
  22         0x89 => 0x2030, // PER MILLE SIGN ('‰')
  23         0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
  24         0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
  25         0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
  26         0x8D => 0xFFFD, // REPLACEMENT CHARACTER
  27         0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
  28         0x8F => 0xFFFD, // REPLACEMENT CHARACTER
  29         0x90 => 0xFFFD, // REPLACEMENT CHARACTER
  30         0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
  31         0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
  32         0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
  33         0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
  34         0x95 => 0x2022, // BULLET ('•')
  35         0x96 => 0x2013, // EN DASH ('–')
  36         0x97 => 0x2014, // EM DASH ('—')
  37         0x98 => 0x02DC, // SMALL TILDE ('˜')
  38         0x99 => 0x2122, // TRADE MARK SIGN ('™')
  39         0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
  40         0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
  41         0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
  42         0x9D => 0xFFFD, // REPLACEMENT CHARACTER
  43         0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
  44         0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
  45     );
  46
  47     protected static $namedCharacterReferences;
  48
  49     protected static $namedCharacterReferenceMaxLength;
  50
  51     /**
  52      * Returns the "real" Unicode codepoint of a malformed character
  53      * reference.
  54      */
  55     public static function getRealCodepoint($ref) {
  56         if (!isset(self::$realCodepointTable[$ref])) return false;
  57         else return self::$realCodepointTable[$ref];
  58     }
  59
  60     public static function getNamedCharacterReferences() {
  61         if (!self::$namedCharacterReferences) {
  62             self::$namedCharacterReferences = unserialize(
  63                 file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
  64         }
  65         return self::$namedCharacterReferences;
  66     }
  67
  68     public static function getNamedCharacterReferenceMaxLength() {
  69         if (!self::$namedCharacterReferenceMaxLength) {
  70             $namedCharacterReferences = self::getNamedCharacterReferences();
  71             $lengths = array_map('strlen', array_keys($namedCharacterReferences));
  72             self::$namedCharacterReferenceMaxLength = max($lengths);
  73         }
  74         return self::$namedCharacterReferenceMaxLength;
  75     }
  76
  77
  78     /**
  79      * Converts a Unicode codepoint to sequence of UTF-8 bytes.
  80      * @note Shamelessly stolen from HTML Purifier, which is also
  81      *       shamelessly stolen from Feyd (which is in public domain).
  82      */
  83     public static function utf8chr($code) {
  84         if($code > 0x10FFFF or $code < 0x0 or
  85           ($code >= 0xD800 and $code <= 0xDFFF) ) {
  86             // bits are set outside the "valid" range as defined
  87             // by UNICODE 4.1.0
  88             return "\xEF\xBF\xBD";
  89         }
  90
  91         $x = $y = $z = $w = 0;
  92         if ($code < 0x80) {
  93             // regular ASCII character
  94             $x = $code;
  95         } else {
  96             // set up bits for UTF-8
  97             $x = ($code & 0x3F) | 0x80;
  98             if ($code < 0x800) {
  99                $y = (($code & 0x7FF) >> 6) | 0xC0;
 100             } else {
 101                 $y = (($code & 0xFC0) >> 6) | 0x80;
 102                 if($code < 0x10000) {
 103                     $z = (($code >> 12) & 0x0F) | 0xE0;
 104                 } else {
 105                     $z = (($code >> 12) & 0x3F) | 0x80;
 106                     $w = (($code >> 18) & 0x07) | 0xF0;
 107                 }
 108             }
 109         }
 110         // set up the actual character
 111         $ret = '';
 112         if($w) $ret .= chr($w);
 113         if($z) $ret .= chr($z);
 114         if($y) $ret .= chr($y);
 115         $ret .= chr($x);
 116
 117         return $ret;
 118     }
 119
 120 }