9 * This source file is subject to the new BSD license that is bundled
10 * with this package in the file LICENSE.
11 * It is also available through the world-wide-web at this URL:
12 * http://phergie.org/license
15 * @package Phergie_Plugin_Encoding
16 * @author Phergie Development Team <team@phergie.org>
17 * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
18 * @license http://phergie.org/license New BSD License
19 * @link http://pear.phergie.org/package/Phergie_Plugin_Encoding
23 * Handles decoding markup entities and converting text between character sets.
27 * @package Phergie_Plugin_Encoding
28 * @author Phergie Development Team <team@phergie.org>
29 * @license http://phergie.org/license New BSD License
30 * @link http://pear.phergie.org/package/Phergie_Plugin_Encoding
32 class Phergie_Plugin_Encoding extends Phergie_Plugin_Abstract
35 * Lookup table for entity conversions not supported by
36 * html_entity_decode()
39 * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73409
40 * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73410
42 protected static $entities = array(
92 * Decodes markup entities in a given string.
94 * @param string $string String containing markup entities
95 * @param string $charset Optional character set name to use in decoding
96 * entities, defaults to UTF-8
98 * @return string String with markup entities decoded
/**
 * Decodes markup entities in a given string.
 *
 * Decoding runs in three passes: the class lookup table is applied first
 * for named entities that html_entity_decode() does not handle, then
 * html_entity_decode() is applied, and finally any decimal or hexadecimal
 * numeric character references still present are converted through
 * codeToUtf8().
 *
 * @param string $string  String containing markup entities
 * @param string $charset Optional character set name to use in decoding
 *        entities, defaults to UTF-8
 *
 * @return string String with markup entities decoded
 */
public function decodeEntities($string, $charset = 'UTF-8')
{
    // NOTE(review): chr() yields a single byte, so any lookup-table value
    // above 255 would wrap around - confirm against the table contents.
    $string = str_ireplace(
        array_keys(self::$entities),
        array_map('chr', self::$entities),
        $string
    );
    $string = html_entity_decode($string, ENT_QUOTES, $charset);

    // Callbacks replace the former /e (eval) patterns: that modifier no
    // longer exists in PHP 7+, and the evaluated code referred to a
    // codeToUtf() method while the class defines codeToUtf8().
    $string = preg_replace_callback(
        '/&#0*([0-9]+);/',
        array($this, 'decodeDecimalReference'),
        $string
    );
    $string = preg_replace_callback(
        '/&#x0*([a-f0-9]+);/i',
        array($this, 'decodeHexReference'),
        $string
    );

    return $string;
}

/**
 * preg_replace_callback() handler for decimal references such as &#8364;.
 *
 * @param array $match Regex match: [0] is the whole reference, [1] its
 *        decimal digits without leading zeros
 *
 * @return string Decoded character, or the reference unchanged when it
 *         cannot be converted
 */
protected function decodeDecimalReference(array $match)
{
    // More than 7 digits cannot be a code point (U+10FFFF is 1114111) and
    // could overflow the integer conversion, so leave the text untouched.
    if (strlen($match[1]) > 7) {
        return $match[0];
    }

    $char = $this->codeToUtf8((int) $match[1]);

    return (is_string($char) && $char !== '') ? $char : $match[0];
}

/**
 * preg_replace_callback() handler for hexadecimal references such as
 * &#x20AC;.
 *
 * @param array $match Regex match: [0] is the whole reference, [1] its
 *        hexadecimal digits without leading zeros
 *
 * @return string Decoded character, or the reference unchanged when it
 *         cannot be converted
 */
protected function decodeHexReference(array $match)
{
    // More than 6 hex digits exceeds U+10FFFF; skipping them also keeps
    // hexdec() from returning a float for very long digit runs.
    if (strlen($match[1]) > 6) {
        return $match[0];
    }

    $char = $this->codeToUtf8(hexdec($match[1]));

    return (is_string($char) && $char !== '') ? $char : $match[0];
}
117 * Converts a given Unicode code point to its UTF-8 equivalent.
119 * @param int $code Code to convert
120 * @return string Character corresponding to code
/**
 * Converts a Unicode code point to its UTF-8 encoded character.
 *
 * @param int $code Unicode code point to convert
 *
 * @return string UTF-8 byte sequence (one to four bytes) for the code
 *         point, or an empty string when $code is not a Unicode scalar
 *         value (negative, a UTF-16 surrogate, or above U+10FFFF)
 */
public function codeToUtf8($code)
{
    $code = (int) $code;

    // Surrogates and values outside 0..0x10FFFF have no well-formed UTF-8
    // encoding; emitting bytes for them would corrupt the output string.
    if ($code < 0 || $code > 0x10FFFF
        || ($code >= 0xD800 && $code <= 0xDFFF)
    ) {
        return '';
    }

    // 1 byte, 7 bits
    if ($code <= 0x7F) {
        return chr($code);
    }

    // 2 bytes, 11 bits
    if ($code <= 0x7FF) {
        return chr(0xC0 | (($code >> 6) & 0x1F)) .
               chr(0x80 | ($code & 0x3F));
    }

    // 3 bytes, 16 bits
    if ($code <= 0xFFFF) {
        return chr(0xE0 | (($code >> 12) & 0x0F)) .
               chr(0x80 | (($code >> 6) & 0x3F)) .
               chr(0x80 | ($code & 0x3F));
    }

    // 4 bytes, 21 bits (the guard above caps this at U+10FFFF)
    return chr(0xF0 | ($code >> 18)) .
           chr(0x80 | (($code >> 12) & 0x3F)) .
           chr(0x80 | (($code >> 6) & 0x3F)) .
           chr(0x80 | ($code & 0x3F));
}
153 * Transliterates characters in a given string where possible.
155 * @param string $string String containing characters to transliterate
157 * @param string $charsetFrom Optional character set of the string, defaults to UTF-8
159 * @param string $charsetTo Optional character set to which the string
160 * should be converted, defaults to ISO-8859-1
162 * @return string String with characters transliterated or the original
163 * string if transliteration was not possible
165 public function transliterate($string, $charsetFrom = 'UTF-8', $charsetTo = 'ISO-8859-1')
167 // @link http://pecl.php.net/package/translit
168 if (function_exists('transliterate')) {
169 $string = transliterate($string, array('han_transliterate', 'diacritical_remove'), $charsetFrom, $charsetTo);
170 } elseif (function_exists('iconv')) {
171 $string = iconv($charsetFrom, $charsetTo . '//TRANSLIT', $string);
173 // @link http://stackoverflow.com/questions/1284535/php-transliteration/1285491#1285491
174 $string = preg_replace(
175 '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
177 htmlentities($string, ENT_COMPAT, $charsetFrom)