5 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
7 // +----------------------------------------------------------------------+
8 // | This library is free software; you can redistribute it and/or modify |
9 // | it under the terms of the GNU Lesser General Public License as |
10 // | published by the Free Software Foundation; either version 2.1 of the |
11 // | License, or (at your option) any later version. |
13 // | This library is distributed in the hope that it will be useful, but |
14 // | WITHOUT ANY WARRANTY; without even the implied warranty of |
15 // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 // | Lesser General Public License for more details. |
18 // | You should have received a copy of the GNU Lesser General Public |
19 // | License along with this library; if not, write to the Free Software |
20 // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
22 // +----------------------------------------------------------------------+
26 require_once 'Net/IDNA2/Exception.php';
27 require_once 'Net/IDNA2/Exception/Nameprep.php';
30 * Encode/decode Internationalized Domain Names.
32 * The class allows one to convert internationalized domain names
33 * (see RFC 3490 for details) as they can be used with various registries worldwide
34 * to be translated between their original (localized) form and their encoded form
35 * as it will be used in the DNS (Domain Name System).
37 * The class provides two public methods, encode() and decode(), which do exactly
38 * what you would expect them to do. You are allowed to use complete domain names,
39 * simple strings and complete email addresses as well. That means, that you might
40 * use any of the following notations:
44 * - xn--brse-5qa.xn--knrz-1ra.info
46 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
47 * array. Unicode output is available in the same formats.
48 * You can select your preferred format via {@link setParams()}.
50 * ACE input and output is always expected to be ASCII.
53 * @author Markus Nix <mnix@docuverse.de>
54 * @author Matthias Sommerfeld <mso@phlylabs.de>
55 * @author Stefan Neufeind <pear.neufeind@speedpartner.de>
62 * These Unicode codepoints are
63 * mapped to nothing, See RFC3454 for details
69 private static $_np_map_nothing = [
100 * Prohibited codepoints
106 private static $_general_prohibited = [
174 * Codepoints prohibited by Nameprep
179 private static $_np_prohibit = [
267 * Codepoint ranges prohibited by nameprep
273 private static $_np_prohibit_ranges = [
279 [0x100000, 0x10FFFD],
287 * Replacement mappings (casemapping, replacement sequences, ...)
293 private static $_np_replacemaps = [
351 0xDF => [0x73, 0x73],
376 0x130 => [0x69, 0x307],
388 0x149 => [0x2BC, 0x6E],
476 0x1F0 => [0x6A, 0x30C],
513 0x37A => [0x20, 0x3B9],
521 0x390 => [0x3B9, 0x308, 0x301],
548 0x3B0 => [0x3C5, 0x308, 0x301],
739 0x587 => [0x565, 0x582],
815 0x1E96 => [0x68, 0x331],
816 0x1E97 => [0x74, 0x308],
817 0x1E98 => [0x77, 0x30A],
818 0x1E99 => [0x79, 0x30A],
819 0x1E9A => [0x61, 0x2BE],
902 0x1F50 => [0x3C5, 0x313],
903 0x1F52 => [0x3C5, 0x313, 0x300],
904 0x1F54 => [0x3C5, 0x313, 0x301],
905 0x1F56 => [0x3C5, 0x313, 0x342],
918 0x1F80 => [0x1F00, 0x3B9],
919 0x1F81 => [0x1F01, 0x3B9],
920 0x1F82 => [0x1F02, 0x3B9],
921 0x1F83 => [0x1F03, 0x3B9],
922 0x1F84 => [0x1F04, 0x3B9],
923 0x1F85 => [0x1F05, 0x3B9],
924 0x1F86 => [0x1F06, 0x3B9],
925 0x1F87 => [0x1F07, 0x3B9],
926 0x1F88 => [0x1F00, 0x3B9],
927 0x1F89 => [0x1F01, 0x3B9],
928 0x1F8A => [0x1F02, 0x3B9],
929 0x1F8B => [0x1F03, 0x3B9],
930 0x1F8C => [0x1F04, 0x3B9],
931 0x1F8D => [0x1F05, 0x3B9],
932 0x1F8E => [0x1F06, 0x3B9],
933 0x1F8F => [0x1F07, 0x3B9],
934 0x1F90 => [0x1F20, 0x3B9],
935 0x1F91 => [0x1F21, 0x3B9],
936 0x1F92 => [0x1F22, 0x3B9],
937 0x1F93 => [0x1F23, 0x3B9],
938 0x1F94 => [0x1F24, 0x3B9],
939 0x1F95 => [0x1F25, 0x3B9],
940 0x1F96 => [0x1F26, 0x3B9],
941 0x1F97 => [0x1F27, 0x3B9],
942 0x1F98 => [0x1F20, 0x3B9],
943 0x1F99 => [0x1F21, 0x3B9],
944 0x1F9A => [0x1F22, 0x3B9],
945 0x1F9B => [0x1F23, 0x3B9],
946 0x1F9C => [0x1F24, 0x3B9],
947 0x1F9D => [0x1F25, 0x3B9],
948 0x1F9E => [0x1F26, 0x3B9],
949 0x1F9F => [0x1F27, 0x3B9],
950 0x1FA0 => [0x1F60, 0x3B9],
951 0x1FA1 => [0x1F61, 0x3B9],
952 0x1FA2 => [0x1F62, 0x3B9],
953 0x1FA3 => [0x1F63, 0x3B9],
954 0x1FA4 => [0x1F64, 0x3B9],
955 0x1FA5 => [0x1F65, 0x3B9],
956 0x1FA6 => [0x1F66, 0x3B9],
957 0x1FA7 => [0x1F67, 0x3B9],
958 0x1FA8 => [0x1F60, 0x3B9],
959 0x1FA9 => [0x1F61, 0x3B9],
960 0x1FAA => [0x1F62, 0x3B9],
961 0x1FAB => [0x1F63, 0x3B9],
962 0x1FAC => [0x1F64, 0x3B9],
963 0x1FAD => [0x1F65, 0x3B9],
964 0x1FAE => [0x1F66, 0x3B9],
965 0x1FAF => [0x1F67, 0x3B9],
966 0x1FB2 => [0x1F70, 0x3B9],
967 0x1FB3 => [0x3B1, 0x3B9],
968 0x1FB4 => [0x3AC, 0x3B9],
969 0x1FB6 => [0x3B1, 0x342],
970 0x1FB7 => [0x3B1, 0x342, 0x3B9],
975 0x1FBC => [0x3B1, 0x3B9],
977 0x1FC2 => [0x1F74, 0x3B9],
978 0x1FC3 => [0x3B7, 0x3B9],
979 0x1FC4 => [0x3AE, 0x3B9],
980 0x1FC6 => [0x3B7, 0x342],
981 0x1FC7 => [0x3B7, 0x342, 0x3B9],
986 0x1FCC => [0x3B7, 0x3B9],
987 0x1FD2 => [0x3B9, 0x308, 0x300],
988 0x1FD3 => [0x3B9, 0x308, 0x301],
989 0x1FD6 => [0x3B9, 0x342],
990 0x1FD7 => [0x3B9, 0x308, 0x342],
995 0x1FE2 => [0x3C5, 0x308, 0x300],
996 0x1FE3 => [0x3C5, 0x308, 0x301],
997 0x1FE4 => [0x3C1, 0x313],
998 0x1FE6 => [0x3C5, 0x342],
999 0x1FE7 => [0x3C5, 0x308, 0x342],
1005 0x1FF2 => [0x1F7C, 0x3B9],
1006 0x1FF3 => [0x3C9, 0x3B9],
1007 0x1FF4 => [0x3CE, 0x3B9],
1008 0x1FF6 => [0x3C9, 0x342],
1009 0x1FF7 => [0x3C9, 0x342, 0x3B9],
1014 0x1FFC => [0x3C9, 0x3B9],
1015 0x20A8 => [0x72, 0x73],
1017 0x2103 => [0xB0, 0x63],
1019 0x2109 => [0xB0, 0x66],
1027 0x2116 => [0x6E, 0x6F],
1033 0x2120 => [0x73, 0x6D],
1034 0x2121 => [0x74, 0x65, 0x6C],
1035 0x2122 => [0x74, 0x6D],
1091 0x3371 => [0x68, 0x70, 0x61],
1092 0x3373 => [0x61, 0x75],
1093 0x3375 => [0x6F, 0x76],
1094 0x3380 => [0x70, 0x61],
1095 0x3381 => [0x6E, 0x61],
1096 0x3382 => [0x3BC, 0x61],
1097 0x3383 => [0x6D, 0x61],
1098 0x3384 => [0x6B, 0x61],
1099 0x3385 => [0x6B, 0x62],
1100 0x3386 => [0x6D, 0x62],
1101 0x3387 => [0x67, 0x62],
1102 0x338A => [0x70, 0x66],
1103 0x338B => [0x6E, 0x66],
1104 0x338C => [0x3BC, 0x66],
1105 0x3390 => [0x68, 0x7A],
1106 0x3391 => [0x6B, 0x68, 0x7A],
1107 0x3392 => [0x6D, 0x68, 0x7A],
1108 0x3393 => [0x67, 0x68, 0x7A],
1109 0x3394 => [0x74, 0x68, 0x7A],
1110 0x33A9 => [0x70, 0x61],
1111 0x33AA => [0x6B, 0x70, 0x61],
1112 0x33AB => [0x6D, 0x70, 0x61],
1113 0x33AC => [0x67, 0x70, 0x61],
1114 0x33B4 => [0x70, 0x76],
1115 0x33B5 => [0x6E, 0x76],
1116 0x33B6 => [0x3BC, 0x76],
1117 0x33B7 => [0x6D, 0x76],
1118 0x33B8 => [0x6B, 0x76],
1119 0x33B9 => [0x6D, 0x76],
1120 0x33BA => [0x70, 0x77],
1121 0x33BB => [0x6E, 0x77],
1122 0x33BC => [0x3BC, 0x77],
1123 0x33BD => [0x6D, 0x77],
1124 0x33BE => [0x6B, 0x77],
1125 0x33BF => [0x6D, 0x77],
1126 0x33C0 => [0x6B, 0x3C9],
1127 0x33C1 => [0x6D, 0x3C9],
1128 /* 0x33C2 => [0x61, 0x2E, 0x6D, 0x2E], */
1129 0x33C3 => [0x62, 0x71],
1130 0x33C6 => [0x63, 0x2215, 0x6B, 0x67],
1131 0x33C7 => [0x63, 0x6F, 0x2E],
1132 0x33C8 => [0x64, 0x62],
1133 0x33C9 => [0x67, 0x79],
1134 0x33CB => [0x68, 0x70],
1135 0x33CD => [0x6B, 0x6B],
1136 0x33CE => [0x6B, 0x6D],
1137 0x33D7 => [0x70, 0x68],
1138 0x33D9 => [0x70, 0x70, 0x6D],
1139 0x33DA => [0x70, 0x72],
1140 0x33DC => [0x73, 0x76],
1141 0x33DD => [0x77, 0x62],
1142 0xFB00 => [0x66, 0x66],
1143 0xFB01 => [0x66, 0x69],
1144 0xFB02 => [0x66, 0x6C],
1145 0xFB03 => [0x66, 0x66, 0x69],
1146 0xFB04 => [0x66, 0x66, 0x6C],
1147 0xFB05 => [0x73, 0x74],
1148 0xFB06 => [0x73, 0x74],
1149 0xFB13 => [0x574, 0x576],
1150 0xFB14 => [0x574, 0x565],
1151 0xFB15 => [0x574, 0x56B],
1152 0xFB16 => [0x57E, 0x576],
1153 0xFB17 => [0x574, 0x56D],
1180 0x10400 => [0x10428],
1181 0x10401 => [0x10429],
1182 0x10402 => [0x1042A],
1183 0x10403 => [0x1042B],
1184 0x10404 => [0x1042C],
1185 0x10405 => [0x1042D],
1186 0x10406 => [0x1042E],
1187 0x10407 => [0x1042F],
1188 0x10408 => [0x10430],
1189 0x10409 => [0x10431],
1190 0x1040A => [0x10432],
1191 0x1040B => [0x10433],
1192 0x1040C => [0x10434],
1193 0x1040D => [0x10435],
1194 0x1040E => [0x10436],
1195 0x1040F => [0x10437],
1196 0x10410 => [0x10438],
1197 0x10411 => [0x10439],
1198 0x10412 => [0x1043A],
1199 0x10413 => [0x1043B],
1200 0x10414 => [0x1043C],
1201 0x10415 => [0x1043D],
1202 0x10416 => [0x1043E],
1203 0x10417 => [0x1043F],
1204 0x10418 => [0x10440],
1205 0x10419 => [0x10441],
1206 0x1041A => [0x10442],
1207 0x1041B => [0x10443],
1208 0x1041C => [0x10444],
1209 0x1041D => [0x10445],
1210 0x1041E => [0x10446],
1211 0x1041F => [0x10447],
1212 0x10420 => [0x10448],
1213 0x10421 => [0x10449],
1214 0x10422 => [0x1044A],
1215 0x10423 => [0x1044B],
1216 0x10424 => [0x1044C],
1217 0x10425 => [0x1044D],
1688 0x213B => [0x66, 0x61, 0x78],
1689 0x3250 => [0x70, 0x74, 0x65],
1690 0x32CC => [0x68, 0x67],
1691 0x32CE => [0x65, 0x76],
1692 0x32CF => [0x6C, 0x74, 0x64],
1693 0x337A => [0x69, 0x75],
1694 0x33DE => [0x76, 0x2215, 0x6D],
1695 0x33DF => [0x61, 0x2215, 0x6D]
1699 * Normalization Combining Classes; Code Points not listed
1700 * got Combining Class 0.
1706 private static $_np_norm_combcls = [
// Prefix that marks a label as Punycode; _encode() prepends it and _decode() requires it
2056 private $_punycode_prefix = 'xn--';
2061 private $_invalid_ucs = 0x80000000;
// Used by _encode() as the starting upper bound when searching for the next code point
2066 private $_max_ucs = 0x10FFFF;
// Bootstring parameters used by _encode(), _decode() and _adapt();
// the values match the Punycode parameters given in RFC 3492
2072 private $_base = 36;
2084 private $_tmax = 26;
2090 private $_skew = 38;
2096 private $_damp = 700;
2102 private $_initial_bias = 72;
2108 private $_initial_n = 0x80;
// Hangul arithmetic constants read by _hangulDecompose() and _hangulCompose():
// base code points of the syllable (S), leading (L), vowel (V) and trailing (T) ranges
2119 private $_sbase = 0xAC00;
2124 private $_lbase = 0x1100;
2129 private $_vbase = 0x1161;
2134 private $_tbase = 0x11a7;
2140 private $_lcount = 19;
2146 private $_vcount = 21;
2152 private $_tcount = 28;
// 588 = 21 * 28, i.e. vcount * tcount
2160 private $_ncount = 588;
2163 * lcount * tcount * vcount
// 11172 = 19 * 21 * 28
2168 private $_scount = 11172;
2171 * Default encoding for encode()'s input and decode()'s output is UTF-8;
2172 * Other possible encodings are ucs4_string and ucs4_array
2173 * See {@link setParams()} for how to select these
2178 private $_api_encoding = 'utf8';
2181 * Overlong UTF-8 encodings are forbidden
2186 private $_allow_overlong = false;
2189 * Behave strict or not
2194 private $_strict_mode = false;
2197 * IDNA-version to use
2199 * Values are "2003" and "2008".
2200 * Defaults to "2003", since that was the original version and for
2201 * compatibility with previous versions of this library.
2202 * If you need to encode "new" characters like the German "Eszett",
2203 * please switch to 2008 first before encoding.
2208 private $_version = '2003';
2211 * Cached value indicating whether or not mbstring function overloading is
2214 * This is cached for optimal performance.
2217 * @see Net_IDNA2::_byteLength()
2219 private static $_mb_string_overload = null;
/**
 * Set up the instance and apply caller-supplied options, if any.
 *
 * Also computes the code point that follows the last Hangul syllable and,
 * the first time it is needed, fills the static cache that records whether
 * mbstring function overloading is active.
 *
 * @param array|null $options Options to initialise the object with;
 *                            a value that is not an array is ignored
 */
public function __construct($options = null)
{
    $syllables = $this->_lcount * $this->_vcount * $this->_tcount;
    $this->_slast = $this->_sbase + $syllables;

    if (is_array($options)) {
        $this->setParams($options);
    }

    // populate mbstring overloading cache if not set
    if (null === self::$_mb_string_overload) {
        // ini_get() is only consulted when the extension is loaded (short-circuit)
        $overloaded = extension_loaded('mbstring')
            && (ini_get('mbstring.func_overload') & 0x02) === 0x02;

        self::$_mb_string_overload = $overloaded;
    }
}
2250 * Sets a new option value. Available options and values:
2252 * [utf8 - Use either UTF-8 or ISO-8859-1 as input (true for UTF-8, false
2253 * otherwise); The output is always UTF-8]
2254 * [overlong - Unicode does not allow unnecessarily long encodings of chars,
2255 * to allow this, set this parameter to true, else to false;
2256 * default is false.]
2257 * [strict - true: strict mode, good for registration purposes - Causes errors
2258 * on failures; false: loose mode, ideal for "wildlife" applications
2259 * by silently ignoring errors and returning the original input instead]
2261 * @param mixed $option Parameter to set (string: single parameter; array of Parameter => Value pairs)
2262 * @param string|false $value Value to use (if parameter 1 is a string)
2264 * @return bool true on success, false otherwise
// Apply one option (name in $option, value in $value) or an array of
// option => value pairs to this instance.
// NOTE(review): the value checks below also cover an IDNA version
// ('2003'/'2008'), which the docblock preceding this method does not list.
2267 public function setParams($option, $value = false): bool
// Normalise the single-option call form into a one-element map
2269 if (!is_array($option)) {
2270 $option = [$option => $value];
2273 foreach ($option as $k => $v) {
// The accepted value becomes the default encoding stored on the instance
2280 $this->_api_encoding = $v;
// Value not accepted for this option
2284 throw new InvalidArgumentException('Set Parameter: Unknown parameter ' . $v . ' for option ' . $k);
// Both flags are reduced to a real boolean
2290 $this->_allow_overlong = ($v) ? true : false;
2294 $this->_strict_mode = ($v) ? true : false;
// NOTE(review): in_array() is called without the strict flag, so the
// integers 2003 and 2008 pass as well and are stored as given
2298 if (in_array($v, ['2003', '2008'])) {
2299 $this->_version = $v;
2301 throw new InvalidArgumentException('Set Parameter: Invalid parameter ' . $v . ' for option ' . $k);
2314 * Encode a given UTF-8 domain name.
2316 * @param string $decoded Domain name (UTF-8 or UCS-4)
2317 * @param string|false $one_time_encoding Desired input encoding, see {@link setParams()}
2318 * If not given will use default-encoding
2320 * @return mixed Encoded Domain name (ACE string) / processed string
// Public encoding entry point: converts the input to a UCS-4 array and
// encodes it piece by piece via _encode().
// NOTE(review): $decoded is type-hinted as string, so a UCS-4 array cannot be
// passed although the class documentation mentions array input - confirm.
2324 public function encode(string $decoded, $one_time_encoding = false)
2326 // Forcing conversion of input to UCS4 array
2327 // If one time encoding is given, use this, else the objects property
2328 switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
2330 $decoded = $this->_utf8_to_ucs4($decoded);
2333 $decoded = $this->_ucs4_string_to_ucs4($decoded);
2338 throw new InvalidArgumentException('Unsupported input format');
2341 // No input, no output, what else did you expect?
2342 if (empty($decoded)) {
2346 // Anchors for iteration
2351 foreach ($decoded as $k => $v) {
2352 // Make sure to use just the plain dot
2357 $decoded[$k] = 0x2E;
2358 // It's right, no break here
2359 // The codepoints above have to be converted to dots anyway
2361 // Stumbling across an anchoring character
2368 // Neither email addresses nor URLs allowed in strict mode
2369 if ($this->_strict_mode) {
2370 throw new InvalidArgumentException('Neither email addresses nor URLs are allowed in strict mode.');
// Encode the segment that lies between the previous anchor and this one
2375 $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k) - $last_begin)));
2377 $output .= $encoded;
// Alternative path: the same segment is appended converted back to UTF-8
// (presumably when _encode() produced nothing - confirm)
2379 $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k) - $last_begin)));
// The anchoring character itself is copied through as a single byte
2381 $output .= chr($decoded[$k]);
2383 $last_begin = $k + 1;
2386 // Catch the rest of the string
// sizeof() is an alias of count()
2388 $inp_len = sizeof($decoded);
2390 $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
2392 $output .= $encoded;
2394 $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
// The whole input is encoded in one go here; the assignment inside the
// condition means a falsy result skips this branch
2399 if ($output = $this->_encode($decoded)) {
2403 return $this->_ucs4_to_utf8($decoded);
2407 * Decode a given ACE domain name.
2409 * @param string $input Domain name (ACE string)
2410 * @param string|false $one_time_encoding Desired output encoding, see {@link setParams()}
2412 * @return mixed Decoded Domain name (UTF-8 or UCS-4) / processed string
// Public decoding entry point: detects whether the input looks like an email
// address, a URL-like string or a bare label, decodes each dot-separated host
// label via _decode() and finally converts to the requested output encoding.
2416 public function decode(string $input, $one_time_encoding = false)
// A one-time encoding is validated up front
2419 if ($one_time_encoding) {
2420 switch ($one_time_encoding) {
2426 throw new InvalidArgumentException('Unknown encoding ' . $one_time_encoding);
2429 // Make sure to drop any newline characters around
2430 $input = trim($input);
2432 // Negotiate input and try to determine, whether it is a plain string,
2433 // an email address or something like a complete URL
// NOTE(review): strpos() returns 0 for a leading '@', which is falsy, so
// such input does not take this branch
2434 if (strpos($input, '@')) { // Maybe it is an email address
2435 // No no in strict mode
2436 if ($this->_strict_mode) {
2437 throw new InvalidArgumentException('Only simple domain name parts can be handled in strict mode');
// Only the part after the first '@' is decoded; the local part is kept as is
2439 list($email_pref, $input) = explode('@', $input, 2);
2440 $arr = explode('.', $input);
2441 foreach ($arr as $k => $v) {
2442 $conv = $this->_decode($v);
2447 $return = $email_pref . '@' . join('.', $arr);
2448 } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
2449 // No no in strict mode
2450 if ($this->_strict_mode) {
2451 throw new InvalidArgumentException('Only simple domain name parts can be handled in strict mode');
2454 $parsed = parse_url($input);
2455 if (isset($parsed['host'])) {
2456 $arr = explode('.', $parsed['host']);
2457 foreach ($arr as $k => $v) {
2458 $conv = $this->_decode($v);
2463 $parsed['host'] = join('.', $arr);
// _unparse_url() appends the scheme verbatim, so the separator is added here
2464 if (isset($parsed['scheme'])) {
2465 $parsed['scheme'] .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
2467 $return = $this->_unparse_url($parsed);
2468 } else { // parse_url seems to have failed, try without it
2469 $arr = explode('.', $input);
2470 foreach ($arr as $k => $v) {
2471 $conv = $this->_decode($v);
2476 $return = join('.', $arr);
2478 } else { // Otherwise we consider it being a pure domain name string
2479 $return = $this->_decode($input);
2481 // The output is UTF-8 by default, other output formats need conversion here
2482 // If one time encoding is given, use this, else the objects property
2483 switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
2488 return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
2491 return $this->_utf8_to_ucs4($return);
2494 throw new InvalidArgumentException('Unsupported output format');
/**
 * Opposite function to parse_url(): glues URL components back together.
 *
 * Inspired by code from comments of php.net-documentation for parse_url()
 *
 * Components that are missing or empty are skipped. The 'scheme' entry is
 * appended verbatim, so the caller has to add the separator ("://" or ":")
 * to it beforehand, as decode() does.
 *
 * @param array $parts_arr parts (strings) as returned by parse_url()
 *
 * @return string Reassembled URL
 */
private function _unparse_url(array $parts_arr): string
{
    // Start from an empty string so that input without a scheme does not
    // append to an undefined variable
    $ret_url = '';

    if (!empty($parts_arr['scheme'])) {
        $ret_url .= $parts_arr['scheme'];
    }

    if (!empty($parts_arr['user'])) {
        $ret_url .= $parts_arr['user'];
        if (!empty($parts_arr['pass'])) {
            $ret_url .= ':' . $parts_arr['pass'];
        }
        $ret_url .= '@';
    }

    // parse_url() omits keys for absent components (e.g. no 'path' for
    // "http://example.com"), hence the null-coalescing defaults
    $ret_url .= $parts_arr['host'] ?? '';

    if (!empty($parts_arr['port'])) {
        $ret_url .= ':' . $parts_arr['port'];
    }

    $ret_url .= $parts_arr['path'] ?? '';

    if (!empty($parts_arr['query'])) {
        $ret_url .= '?' . $parts_arr['query'];
    }

    if (!empty($parts_arr['fragment'])) {
        $ret_url .= '#' . $parts_arr['fragment'];
    }

    return $ret_url;
}
2538 * The actual encoding algorithm.
2540 * @param array of strings $decoded Decoded string which should be encoded
2542 * @return string Encoded string
// Punycode-encodes one sequence of code points (as handed over by encode()).
2546 private function _encode($decoded): string
2548 // We cannot encode a domain name containing the Punycode prefix
// Compare the leading code points of the input with the prefix's code points
2549 $extract = self::_byteLength($this->_punycode_prefix);
2550 $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
2551 $check_deco = array_slice($decoded, 0, $extract);
2553 if ($check_pref == $check_deco) {
2554 throw new InvalidArgumentException('This is already a punycode string');
2557 // We will not try to encode strings consisting of basic code points only
2559 foreach ($decoded as $k => $v) {
2566 if ($this->_strict_mode) {
2567 throw new InvalidArgumentException('The given string does not contain encodable chars');
// From here on the Nameprep'd form of the input is what gets encoded
2574 $decoded = $this->_nameprep($decoded);
2576 $deco_len = count($decoded);
2583 // How many chars have been consumed
2586 // Start with the prefix; copy it to output
2587 $encoded = $this->_punycode_prefix;
2590 // Copy all basic code points to output
2591 for ($i = 0; $i < $deco_len; ++$i) {
2592 $test = $decoded[$i];
// NOTE(review): as written, the first range also admits 0x3A-0x3F and the
// third admits 0x7B, which is wider than the character class named below
2593 // Will match [0-9a-zA-Z-]
2594 if ((0x2F < $test && $test < 0x40)
2595 || (0x40 < $test && $test < 0x5B)
2596 || (0x60 < $test && $test <= 0x7B)
2599 $encoded .= chr($decoded[$i]);
2604 // All codepoints were basic ones
2605 if ($codecount == $deco_len) {
2609 // Start with the prefix; copy it to output
2610 $encoded = $this->_punycode_prefix . $encoded;
2612 // If we have basic code points in output, add an hyphen to the end
2617 // Now find and encode all non-basic code points
2619 $cur_code = $this->_initial_n;
2620 $bias = $this->_initial_bias;
2623 while ($codecount < $deco_len) {
2624 // Find the smallest code point >= the current code point and
2625 // remember the last occurrence of it in the input
2626 for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
2627 if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
2628 $next_code = $decoded[$i];
// Advance delta by the distance to the next code point, scaled by the
// number of code points handled so far plus one
2632 $delta += ($next_code - $cur_code) * ($codecount + 1);
2633 $cur_code = $next_code;
2635 // Scan input again and encode all characters whose code point is $cur_code
2636 for ($i = 0; $i < $deco_len; $i++) {
2637 if ($decoded[$i] < $cur_code) {
2639 } elseif ($decoded[$i] == $cur_code) {
// Emit delta as a variable-length integer; $t is the threshold for position $k
2640 for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
2641 $t = ($k <= $bias) ?
2643 (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
// NOTE(review): ceil() returns a float while _encodeDigit() declares an int
// parameter, so this call relies on PHP's scalar type coercion
2649 $encoded .= $this->_encodeDigit(ceil($t + (($q - $t) % ($this->_base - $t))));
2650 $q = ($q - $t) / ($this->_base - $t);
2653 $encoded .= $this->_encodeDigit($q);
2654 $bias = $this->_adapt($delta, $codecount + 1, $is_first);
2669 * The actual decoding algorithm.
2671 * @param string $encoded Encoded string which should be decoded
2673 * @return string Decoded string
// Decodes one Punycode label and returns the result as UTF-8.
// NOTE(review): the brace offset syntax used below ($encoded{$k}) was
// deprecated in PHP 7.4 and removed in PHP 8.0; $encoded[$k] is the
// supported spelling.
2677 private function _decode($encoded): string
2679 // We do need to find the Punycode prefix
2680 if (!preg_match('!^' . preg_quote($this->_punycode_prefix, '!') . '!', $encoded)) {
2684 $encode_test = preg_replace('!^' . preg_quote($this->_punycode_prefix, '!') . '!', '', $encoded);
2686 // If nothing left after removing the prefix, it is hopeless
2687 if (!$encode_test) {
2691 // Find last occurrence of the delimiter
2692 $delim_pos = strrpos($encoded, '-');
// Everything between the prefix and the last '-' is copied as literal bytes
2694 if ($delim_pos > self::_byteLength($this->_punycode_prefix)) {
2695 for ($k = self::_byteLength($this->_punycode_prefix); $k < $delim_pos; ++$k) {
2696 $decoded[] = ord($encoded{$k});
2702 $deco_len = count($decoded);
2703 $enco_len = self::_byteLength($encoded);
2705 // Wandering through the strings; init
2707 $bias = $this->_initial_bias;
2709 $char = $this->_initial_n;
// Each outer iteration decodes one code point and inserts it into $decoded
2711 for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
// Read one variable-length integer, digit by digit, into $idx
2712 for ($old_idx = $idx, $w = 1, $k = $this->_base; 1; $k += $this->_base) {
2713 $digit = $this->_decodeDigit($encoded{$enco_idx++});
2714 $idx += $digit * $w;
2716 $t = ($k <= $bias) ?
2718 (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
2724 $w = (int)($w * ($this->_base - $t));
2727 $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
// Split $idx into the code point increment and the insert position
2729 $char += (int)($idx / ($deco_len + 1));
2730 $idx %= ($deco_len + 1);
2732 if ($deco_len > 0) {
2733 // Make room for the decoded char
2734 for ($i = $deco_len; $i > $idx; $i--) {
2735 $decoded[$i] = $decoded[($i - 1)];
2739 $decoded[$idx++] = $char;
2742 return $this->_ucs4_to_utf8($decoded);
/**
 * Bias adaptation step shared by _encode() and _decode().
 *
 * @param int  $delta    Delta to adapt from
 * @param int  $npoints  Divisor for the delta correction; both callers pass
 *                       their current code point count plus one
 * @param bool $is_first When true the delta is divided by the damping
 *                       factor, otherwise it is halved
 *
 * @return int The new bias
 */
private function _adapt(int $delta, int $npoints, bool $is_first): int
{
    $span = $this->_base - $this->_tmin;
    $limit = ($span * $this->_tmax) / 2;

    $divisor = $is_first ? $this->_damp : 2;
    $delta = (int)($delta / $divisor);
    $delta += (int)($delta / $npoints);

    // Shrink delta until it drops to the limit, counting one base per round
    $k = 0;
    while ($delta > $limit) {
        $delta = (int)($delta / $span);
        $k += $this->_base;
    }

    return (int)($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
/**
 * Map a digit value onto its ASCII character.
 *
 * Values below 26 yield the lowercase letters starting at 'a' (0 => 'a');
 * values from 26 upwards are offset so that 26 => '0'.
 *
 * @param int $d One digit to encode
 *
 * @return string (char) Encoded digit
 */
private function _encodeDigit(int $d): string
{
    // 97 is ord('a'); 22 shifts 26 onto ord('0') = 48
    $offset = ($d < 26) ? 97 : 22;

    return chr($d + $offset);
}
2781 * Decode a certain digit.
2783 * @param string (char) $cp One digit (character) to decode
2785 * @return int Decoded digit
// Translate one digit character into its numeric value; yields $this->_base
// when none of the three range tests matches.
// NOTE(review): the arithmetic in the return expression needs an integer
// character code; presumably a statement not visible here converts $cp with
// ord() first - confirm.
// NOTE(review): the comparisons are signed, so a code below 48 also satisfies
// the first test; _decode() does not check the result against the base, so
// verify its loop termination before tightening these ranges.
2788 private function _decodeDigit(string $cp): int
2791 return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
2795 * Do Nameprep according to RFC3491 and RFC3454.
2797 * @param array $input Unicode Characters
2799 * @return array of strings Unicode Characters, Nameprep'd
// Nameprep pass over a UCS-4 array: drops "map to nothing" code points,
// rejects prohibited ones, decomposes/maps the rest and then recombines.
2803 private function _nameprep(array $input): array
2807 // Walking through the input array, performing the required steps on each of
2808 // the input chars and putting the result into the output array
2809 // While mapping required chars we apply the canonical ordering
2811 foreach ($input as $v) {
2812 // Map to nothing == skip that code point
// in_array() scans the whole table for every input code point
2813 if (in_array($v, self::$_np_map_nothing)) {
2817 // Try to find prohibited input
2818 if (in_array($v, self::$_np_prohibit) || in_array($v, self::$_general_prohibited)) {
2819 throw new Net_IDNA2_Exception_Nameprep('Prohibited input U+' . sprintf('%08X', $v));
// Each range is an inclusive [first, last] pair
2822 foreach (self::$_np_prohibit_ranges as $range) {
2823 if ($range[0] <= $v && $v <= $range[1]) {
2824 throw new Net_IDNA2_Exception_Nameprep('Prohibited input U+' . sprintf('%08X', $v));
2828 // Hangul syllable decomposition
// NOTE(review): this range ends at 0xD7AF while _hangulDecompose() only
// decomposes offsets below _scount (11172), i.e. up to 0xD7A3
2829 if (0xAC00 <= $v && $v <= 0xD7AF) {
2830 foreach ($this->_hangulDecompose($v) as $out) {
2833 } elseif (($this->_version == '2003') && isset(self::$_np_replacemaps[$v])) {
2834 // There's a decomposition mapping for that code point
2835 // Decompositions only in version 2003 (original) of IDNA
2836 foreach ($this->_applyCannonicalOrdering(self::$_np_replacemaps[$v]) as $out) {
2844 // Combine code points
2848 $out_len = count($output);
2850 for ($i = 0; $i < $out_len; ++$i) {
2851 $class = $this->_getCombiningClass($output[$i]);
2853 if ((!$last_class || $last_class != $class) && $class) {
// Try to compose the run that starts at the last starter and ends before $i
2855 $seq_len = $i - $last_starter;
2856 $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
2858 // On match: Replace the last starter with the composed character and remove
2859 // the now redundant non-starter(s)
2861 $output[$last_starter] = $out;
// NOTE(review): $out is stored as a single element above and then passed to
// count(); if _combine() hands back a scalar code point, count() raises a
// TypeError on PHP 8 - confirm what _combine() returns on a match
2863 if (count($out) != $seq_len) {
// Close the gap by shifting the following elements one slot to the left
2864 for ($j = $i + 1; $j < $out_len; ++$j) {
2865 $output[$j - 1] = $output[$j];
2868 unset($output[$out_len]);
2871 // Rewind the for loop by one, since there can be more possible compositions
2874 $last_class = ($i == $last_starter) ? 0 : $this->_getCombiningClass($output[$i - 1]);
2880 // The current class is 0
2885 $last_class = $class;
2892 * Decomposes a Hangul syllable
2893 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
2895 * @param int $char 32bit UCS4 code point
2897 * @return array Either Hangul Syllable decomposed or original 32bit
2898 * value as one value array
// Arithmetic decomposition of a Hangul syllable into its jamo code points.
2901 private function _hangulDecompose(int $char): array
// Offset of the code point inside the syllable block
2903 $sindex = $char - $this->_sbase;
// Guard for code points that lie outside the syllable block
2905 if ($sindex < 0 || $sindex >= $this->_scount) {
// Trailing consonant candidate; equal to _tbase when the syllable has none
2910 $T = $this->_tbase + $sindex % $this->_tcount;
// Leading consonant, then vowel; the (int) cast truncates the float division
2911 $result[] = (int)($this->_lbase + $sindex / $this->_ncount);
2912 $result[] = (int)($this->_vbase + ($sindex % $this->_ncount) / $this->_tcount);
2914 if ($T != $this->_tbase) {
2922 * Composes a Hangul syllable
2923 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
2925 * @param array $input Decomposed UCS4 sequence
2927 * @return array UCS4 sequence with syllables composed
// Recombines jamo sequences (L+V, then LV+T) into Hangul syllables.
2930 private function _hangulCompose(array $input): array
2932 $inp_len = count($input);
2940 $result[] = $last; // copy first char from input to output
2942 for ($i = 1; $i < $inp_len; ++$i) {
2945 // Find out, whether two current characters from L and V
2946 $lindex = $last - $this->_lbase;
2948 if (0 <= $lindex && $lindex < $this->_lcount) {
2949 $vindex = $char - $this->_vbase;
2951 if (0 <= $vindex && $vindex < $this->_vcount) {
2952 // create syllable of form LV
2953 $last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount);
// Overwrite the most recently emitted element with the composed syllable
2954 $out_off = count($result) - 1;
2955 $result[$out_off] = $last; // reset last
2962 // Find out, whether two current characters are LV and T
2963 $sindex = $last - $this->_sbase;
// A remainder of 0 means the previous syllable has no trailing consonant yet
2965 if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0) {
2966 $tindex = $char - $this->_tbase;
// NOTE(review): both bounds are inclusive here, so tindex 0 and tindex equal
// to _tcount are accepted; the Hangul composition algorithm in UAX #15 uses
// 0 < TIndex < TCount - confirm
2968 if (0 <= $tindex && $tindex <= $this->_tcount) {
2969 // create syllable of form LVT
2971 $out_off = count($result) - 1;
2972 $result[$out_off] = $last; // reset last
2979 // if neither case was true, just add the character
/**
 * Look up the combining class of a wide char.
 *
 * Code points that have no entry in the combining class table count as
 * class 0.
 *
 * @param int $char Wide char to check (32bit integer)
 *
 * @return int Combining class if found, else 0
 */
private function _getCombiningClass(int $char): int
{
    return self::$_np_norm_combcls[$char] ?? 0;
}
3001 * Applies the canonical ordering of a decomposed UCS4 sequence.
3003 * @param array $input Decomposed UCS4 sequence
3005 * @return array Ordered UCS4 sequence
// Reorders a decomposed sequence by combining class: an element with a
// non-zero class is moved left past neighbours whose class is higher.
3008 private function _applyCannonicalOrdering(array $input): array
3011 $size = count($input);
3015 $last = $this->_getCombiningClass($input[0]);
3017 for ($i = 0; $i < $size - 1; ++$i) {
3018 $next = $this->_getCombiningClass($input[$i + 1]);
// Class 0 elements are never moved ($next != 0)
3020 if ($next != 0 && $last > $next) {
3021 // Move item leftward until it fits
3022 for ($j = $i + 1; $j > 0; --$j) {
// Test whether the left neighbour's class is not higher than $next
3023 if ($this->_getCombiningClass($input[$j - 1]) <= $next) {
// Swap the element with its left neighbour
3028 $input[$j] = $input[$j - 1];
3029 $input[$j - 1] = $t;
3033 // Reentering the loop looking at the old character again
3045 * Do composition of a sequence of starter and non-starter.
3047 * @param array $input UCS4 Decomposed sequence
3049 * @return array|false Ordered UCS4 sequence
// Tries to compose a starter/non-starter run: first via Hangul composition,
// then by searching the replacement map for an entry whose target sequence
// matches $input.
3052 private function _combine($input)
3054 $inp_len = count($input);
3056 // Is it a Hangul syllable?
3057 if (1 != $inp_len) {
3058 $hangul = $this->_hangulCompose($input);
3060 // This place is probably wrong
// A changed length means _hangulCompose() merged at least one pair
3061 if (count($hangul) != $inp_len) {
// Reverse lookup: walks the complete replacement map on every call
3066 foreach (self::$_np_replacemaps as $np_src => $np_target) {
// Cheap rejections first: different first code point or different length
3067 if ($np_target[0] != $input[0]) {
3071 if (count($np_target) != $inp_len) {
// Element-wise comparison of the candidate target with the input
3077 foreach ($input as $k2 => $v2) {
3078 if ($v2 == $np_target[$k2]) {
3095 * This converts an UTF-8 encoded string to its UCS-4 (array) representation
3096 * By talking about UCS-4 we mean arrays of 32bit integers representing
3097 * each of the "chars". This is due to PHP not being able to handle strings with
3098 * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
3099 * The following UTF-8 encodings are supported:
3101 * bytes bits representation
3103 * 2 11 110xxxxx 10xxxxxx
3104 * 3 16 1110xxxx 10xxxxxx 10xxxxxx
3105 * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
3106 * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
3107 * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
3109 * Each x represents a bit that can be used to store character data.
3111 * @param string $input utf8-encoded string
3113 * @return array ucs4-encoded array
// Byte-wise UTF-8 decoder driven by a small state machine held in $mode
// ('next' = expecting a start byte, 'add' = expecting continuation bytes).
// NOTE(review): the brace offset syntax used below ($input{$k}) was
// deprecated in PHP 7.4 and removed in PHP 8.0; $input[$k] is the supported
// spelling.
3117 private function _utf8_to_ucs4(string $input): array
3121 $inp_len = self::_byteLength($input, '8bit');
3124 for ($k = 0; $k < $inp_len; ++$k) {
3125 $v = ord($input{$k}); // Extract byte from input string
3127 if ($v < 128) { // We found an ASCII char - put into string as is
3128 $output[$out_len] = $v;
// An ASCII byte while continuation bytes are still pending is malformed
3130 if ('add' == $mode) {
3131 throw new UnexpectedValueException('Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k);
3135 if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
// Each branch strips the length marker bits and shifts the payload into place
3139 if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
3140 $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
3141 $v = ($v - 192) << 6;
3142 } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
3144 $v = ($v - 224) << 12;
3145 } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
3147 $v = ($v - 240) << 18;
3148 } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
3150 $v = ($v - 248) << 24;
3151 } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
3153 $v = ($v - 252) << 30;
3155 throw new UnexpectedValueException('This might be UTF-8, but I don\'t understand it at byte ' . $k);
// Start byte accepted: store its payload as the beginning of a new code point
3157 if ('add' == $mode) {
3158 $output[$out_len] = (int)$v;
3163 if ('add' == $mode) {
// Range check on the first continuation byte, skipped when overlong forms are allowed
3164 if (!$this->_allow_overlong && $test == 'range') {
3166 if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
3167 throw new OutOfRangeException('Bogus UTF-8 character detected (out of legal range) at byte ' . $k);
3170 if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
// Add the six payload bits at the position given by $next_byte
3171 $v = ($v - 128) << ($next_byte * 6);
3172 $output[($out_len - 1)] += $v;
3175 throw new UnexpectedValueException('Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k);
// A negative counter means the last continuation byte has been consumed
3177 if ($next_byte < 0) {
/**
 * Convert UCS-4 array into UTF-8 string
 *
 * Each value is written with the shortest of the 1 to 6 byte forms that
 * can hold it (7, 11, 16, 21, 26 or 31 payload bits).
 *
 * @param array $input ucs4-encoded array
 *
 * @return string utf8-encoded string
 * @throws UnexpectedValueException if a value is negative or needs more than 31 bits
 */
private function _ucs4_to_utf8(array $input): string
{
    $output = '';

    foreach ($input as $v) {
        if ($v < 0) {
            // A negative value would otherwise pass the "< 128" test below
            // and be emitted as an arbitrary single byte.
            throw new UnexpectedValueException('Conversion from UCS-4 to UTF-8 failed: malformed input');
        }

        if ($v < 128) {
            // 7bit are transferred literally
            $output .= chr($v);
        } elseif ($v < 1 << 11) {
            // 2 bytes
            $output .= chr(192 + ($v >> 6))
                . chr(128 + ($v & 63));
        } elseif ($v < 1 << 16) {
            // 3 bytes
            $output .= chr(224 + ($v >> 12))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        } elseif ($v < 1 << 21) {
            // 4 bytes
            $output .= chr(240 + ($v >> 18))
                . chr(128 + (($v >> 12) & 63))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        } elseif ($v < 1 << 26) {
            // 5 bytes
            $output .= chr(248 + ($v >> 24))
                . chr(128 + (($v >> 18) & 63))
                . chr(128 + (($v >> 12) & 63))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        } elseif ($v <= 0x7FFFFFFF) {
            // 6 bytes; the bound is spelled as a literal because 1 << 31
            // overflows to a negative number on 32-bit builds
            $output .= chr(252 + ($v >> 30))
                . chr(128 + (($v >> 24) & 63))
                . chr(128 + (($v >> 18) & 63))
                . chr(128 + (($v >> 12) & 63))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        } else {
            throw new UnexpectedValueException('Conversion from UCS-4 to UTF-8 failed: malformed input');
        }
    }

    return $output;
}
/**
 * Convert UCS-4 array into UCS-4 string
 *
 * Every value becomes exactly 4 bytes, most significant byte first.
 *
 * @param array $input ucs4-encoded array
 *
 * @return string ucs4-encoded string
 */
private function _ucs4_to_ucs4_string(array $input): string
{
    $output = '';

    foreach ($input as $v) {
        // 'N' = unsigned 32 bit, big endian. The former hand-written masks
        // ("$v & (255 << 24) >> 24") all collapsed to "$v & 255" because >>
        // binds tighter than &, and the results were appended as decimal
        // text rather than as raw bytes.
        $output .= pack('N', $v);
    }

    return $output;
}
/**
 * Convert UCS-4 string into UCS-4 array
 *
 * The string is read as a sequence of 4-byte groups, most significant
 * byte first; each group yields one array element.
 *
 * @param string $input ucs4-encoded string
 *
 * @return array ucs4-encoded array
 * @throws InvalidArgumentException if the byte length is not a multiple of 4
 */
private function _ucs4_string_to_ucs4(string $input): array
{
    $inp_len = self::_byteLength($input);

    // Input length must be divisible by 4
    if ($inp_len % 4) {
        throw new InvalidArgumentException('Input UCS4 string is broken');
    }

    // Empty input - return empty output
    if (!$inp_len) {
        return [];
    }

    // 'N*' = as many unsigned 32 bit big endian values as the input holds.
    // unpack() numbers its result from 1, hence the re-indexing. This
    // replaces a byte loop that relied on the curly-brace string offset
    // ($input{$i}), which is a fatal error on PHP 8.
    return array_values(unpack('N*', $input));
}
/**
 * Echo hex representation of UCS4 sequence.
 *
 * One line is printed per element, in the form "[key] => HEX"; when
 * requested, the bit pattern of the value follows in parentheses.
 *
 * @param array $input       UCS4 sequence
 * @param bool  $include_bit Include bitmask in output
 *
 * @return void
 */
private static function _showHex(array $input, bool $include_bit = false) //: void XXX PHP: Upgrade to PHP 7.1
{
    foreach ($input as $index => $value) {
        printf('[%s] => %X', $index, $value);

        if ($include_bit) {
            echo ' (';
            echo Net_IDNA2::_showBitmask($value);
            echo ')';
        }

        echo "\n";
    }
}
/**
 * Gives you a bit representation of given Byte (8 bits), Word (16 bits) or DWord (32 bits)
 *
 * The output width is picked from the magnitude of the value: 32 digits
 * for values of 65536 and above, 16 digits for values of 256 and above,
 * 8 digits otherwise.
 *
 * @param int $octet Value to render
 *
 * @return string Bitmask-representation, most significant bit first
 */
private static function _showBitmask(int $octet): string
{
    $width = 8;
    if ($octet >= 0x100) {
        $width = 16;
    }
    if ($octet >= 0x10000) {
        $width = 32;
    }

    $bits  = '';
    $shift = $width;
    while ($shift-- > 0) {
        // Bring the wanted bit down to position 0 and test it
        $bits .= (($octet >> $shift) & 1) ? '1' : '0';
    }

    return $bits;
}
/**
 * Gets the length of a string in bytes even if mbstring function
 * overloading is turned on
 *
 * When the overload flag is set the length is taken with mb_strlen() and
 * the given encoding, otherwise plain strlen() is used.
 *
 * @param string $string   the string for which to get the length.
 * @param string $encoding [optional] encoding name handed to mb_strlen()
 *
 * @return int the length of the string in bytes.
 *
 * @see Net_IDNA2::$_mb_string_overload
 */
private static function _byteLength(string $string, string $encoding = '8bit'): int
{
    return self::$_mb_string_overload
        ? mb_strlen($string, $encoding)
        : strlen($string);
}
/**
 * Create a new Net_IDNA2 instance.
 *
 * @param array $params Set of parameters, handed to the constructor unchanged
 *
 * @return Net_IDNA2 the newly created instance
 */
public static function getInstance(array $params = []): Net_IDNA2
{
    $instance = new Net_IDNA2($params);

    return $instance;
}
/**
 * Return a shared Net_IDNA2 instance for the given parameters.
 *
 * Instances are remembered per serialized parameter set; a new one is
 * only created (through getInstance()) when no instance for exactly this
 * serialized form exists yet.
 *
 * @param array $params Set of parameters
 *
 * @return Net_IDNA2 instance belonging to $params
 */
public static function singleton(array $params = []): Net_IDNA2
{
    static $instances = [];

    $signature = serialize($params);

    if (isset($instances[$signature])) {
        return $instances[$signature];
    }

    $instances[$signature] = Net_IDNA2::getInstance($params);

    return $instances[$signature];
}