* @copyright 2008-2010 Phergie Development Team (http://phergie.org)
 * @license   http://phergie.org/license New BSD License
 * @link      http://pear.phergie.org/package/Phergie_Plugin_Encoding
 */

/**
 * Handles decoding markup entities and converting text between character
 * encodings.
 *
 * @category Phergie
 * @package  Phergie_Plugin_Encoding
 * @author   Phergie Development Team
 * @license  http://phergie.org/license New BSD License
 * @link     http://pear.phergie.org/package/Phergie_Plugin_Encoding
 */
class Phergie_Plugin_Encoding extends Phergie_Plugin_Abstract
{
    /**
     * Lookup table for entity conversions not supported by
     * html_entity_decode()
     *
     * Keys are named entities; values are the character codes that
     * decodeEntities() passes to chr() to build the replacement.
     *
     * NOTE(review): chr() produces a single byte and reduces values above
     * 255 modulo 256, so entries such as 913 cannot yield the intended
     * character. Left unchanged here to preserve behaviour; confirm the
     * intended target character set before correcting the table.
     *
     * @var array
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73409
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73410
     */
    protected static $entities = array(
        '&alpha;' => 913,
        '&apos;' => 39,
        '&beta;' => 914,
        '&bull;' => 149,
        '&chi;' => 935,
        '&circ;' => 94,
        '&delta;' => 916,
        '&epsilon;' => 917,
        '&eta;' => 919,
        '&fnof;' => 402,
        '&gamma;' => 915,
        '&iota;' => 921,
        '&kappa;' => 922,
        '&lambda;' => 923,
        '&ldquo;' => 147,
        '&lsaquo;' => 139,
        '&lsquo;' => 145,
        '&mdash;' => 151,
        '&minus;' => 45,
        '&mu;' => 924,
        '&ndash;' => 150,
        '&nu;' => 925,
        '&oelig;' => 140,
        '&omega;' => 937,
        '&omicron;' => 927,
        '&phi;' => 934,
        '&pi;' => 928,
        '&piv;' => 982,
        '&psi;' => 936,
        '&rdquo;' => 148,
        '&rho;' => 929,
        '&rsaquo;' => 155,
        '&rsquo;' => 146,
        '&scaron;' => 138,
        '&sigma;' => 931,
        '&sigmaf;' => 962,
        '&tau;' => 932,
        '&theta;' => 920,
        '&thetasym;' => 977,
        '&tilde;' => 126,
        '&trade;' => 153,
        '&upsih;' => 978,
        '&upsilon;' => 933,
        '&xi;' => 926,
        '&yuml;' => 159,
        '&zeta;' => 918,
    );

    /**
     * Decodes markup entities in a given string.
     *
     * Decoding happens in three passes: the class lookup table is applied
     * first (case-insensitively), then html_entity_decode(), and finally any
     * numeric entities still present (decimal, then hexadecimal) are
     * converted to UTF-8 through codeToUtf8().
     *
     * @param string $string  String containing markup entities
     * @param string $charset Optional character set name to use in decoding
     *        entities, defaults to UTF-8
     *
     * @return string String with markup entities decoded
     */
    public function decodeEntities($string, $charset = 'UTF-8')
    {
        $string = str_ireplace(
            array_keys(self::$entities),
            array_map('chr', self::$entities),
            $string
        );

        $string = html_entity_decode($string, ENT_QUOTES, $charset);

        // Callbacks are used instead of the eval-style /e modifier, which is
        // no longer available in current PHP versions. The decimal pass runs
        // before the hexadecimal pass.
        $string = preg_replace_callback(
            '/&#0*([0-9]+);/',
            array($this, 'decodeDecimalEntity'),
            $string
        );
        $string = preg_replace_callback(
            '/&#x0*([a-f0-9]+);/i',
            array($this, 'decodeHexEntity'),
            $string
        );

        return $string;
    }

    /**
     * Callback converting a matched decimal numeric entity to UTF-8.
     *
     * @param array $matches Regular expression matches; index 1 holds the
     *        decimal digits of the code
     *
     * @return string UTF-8 character, or an empty string when the code is
     *         outside the range handled by codeToUtf8()
     */
    protected function decodeDecimalEntity(array $matches)
    {
        return (string) $this->codeToUtf8($matches[1]);
    }

    /**
     * Callback converting a matched hexadecimal numeric entity to UTF-8.
     *
     * @param array $matches Regular expression matches; index 1 holds the
     *        hexadecimal digits of the code
     *
     * @return string UTF-8 character, or an empty string when the code is
     *         outside the range handled by codeToUtf8()
     */
    protected function decodeHexEntity(array $matches)
    {
        return (string) $this->codeToUtf8(hexdec($matches[1]));
    }

    /**
     * Converts a given unicode to its UTF-8 equivalent.
     *
     * @param int $code Code to convert
     *
     * @return string|null Character corresponding to code, or null when the
     *         code is negative or needs more than 21 bits
     */
    public function codeToUtf8($code)
    {
        $code = (int) $code;

        // Outside the range representable in at most four bytes
        if ($code < 0 || $code > 0x1FFFFF) {
            return null;
        }

        // 1 byte, 7 bits
        if ($code <= 0x7F) {
            return chr($code);
        }

        // 2 bytes, 11 bits
        if ($code <= 0x7FF) {
            return chr(0xC0 | (($code >> 6) & 0x1F))
                . chr(0x80 | ($code & 0x3F));
        }

        // 3 bytes, 16 bits
        if ($code <= 0xFFFF) {
            return chr(0xE0 | (($code >> 12) & 0x0F))
                . chr(0x80 | (($code >> 6) & 0x3F))
                . chr(0x80 | ($code & 0x3F));
        }

        // 4 bytes, 21 bits
        return chr(0xF0 | ($code >> 18))
            . chr(0x80 | (($code >> 12) & 0x3F))
            . chr(0x80 | (($code >> 6) & 0x3F))
            . chr(0x80 | ($code & 0x3F));
    }

    /**
     * Transliterates characters in a given string where possible.
     *
     * Uses the first available strategy: the transliterate() function
     * (PECL translit), then iconv() with //TRANSLIT, and otherwise a
     * fallback that entity-encodes the string and reduces accented-letter
     * entities to their base letter.
     *
     * @param string $string      String containing characters to
     *        transliterate
     * @param string $charsetFrom Optional character set of the string,
     *        defaults to UTF-8
     * @param string $charsetTo   Optional character set to which the string
     *        should be converted, defaults to ISO-8859-1
     *
     * @return string String with characters transliterated or the original
     *         string if transliteration was not possible
     */
    public function transliterate($string, $charsetFrom = 'UTF-8', $charsetTo = 'ISO-8859-1')
    {
        // @link http://pecl.php.net/package/translit
        if (function_exists('transliterate')) {
            return transliterate(
                $string,
                array('han_transliterate', 'diacritical_remove'),
                $charsetFrom,
                $charsetTo
            );
        }

        if (function_exists('iconv')) {
            $converted = iconv($charsetFrom, $charsetTo . '//TRANSLIT', $string);

            // iconv() returns false on failure; honour the documented
            // contract by handing back the original string in that case.
            return ($converted === false) ? $string : $converted;
        }

        // @link http://stackoverflow.com/questions/1284535/php-transliteration/1285491#1285491
        return preg_replace(
            '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
            '$1',
            htmlentities($string, ENT_COMPAT, $charsetFrom)
        );
    }
}