/**
 * Parses a URI into the components and fragment identifier as specified
 * by the URI RFC (RFC 3986; the splitting regexp follows its Appendix B).
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the percent encoder used to normalize URIs before parsing.
     */
    public function __construct() {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI, or false if the
     *         regexp could not be applied at all. This representation has
     *         not been validated yet and may not conform to RFC.
     */
    public function parse($uri) {

        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([^:/?#"<>]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // Separate out the parts. The odd-numbered groups include their
        // delimiter (":", "//", "?", "#"), so testing them with empty()
        // tells "component absent" (null) apart from "component present
        // but empty" (''). preg_match() drops trailing unmatched groups,
        // and empty() tolerates those missing indexes.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Do not use empty() here: empty('0') is true in PHP, which
            // would silently turn the legitimate host "0" into ''.
            $host       = isset($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}