extlib/Mf2/Parser.php

   1 <?php
   2
   3 namespace Mf2;
   4
   5 use DOMDocument;
   6 use DOMElement;
   7 use DOMXPath;
   8 use DOMNode;
   9 use DOMNodeList;
  10 use Exception;
  11 use SplObjectStorage;
  12 use stdClass;
  13
  14 /**
  15  * Parse Microformats2
  16  *
  17  * Functional shortcut for the commonest cases of parsing microformats2 from HTML.
  18  *
  19  * Example usage:
  20  *
  21  *     use Mf2;
  22  *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
  23  *     echo json_encode($output, JSON_PRETTY_PRINT);
  24  *
  25  * Produces:
  26  *
  27  *     {
  28  *      "items": [
  29  *       {
  30  *        "type": ["h-card"],
  31  *        "properties": {
  32  *         "name": ["Barnaby Walters"]
  33  *        }
  34  *       }
  35  *      ],
  36  *      "rels": {}
  37  *     }
  38  *
  39  * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
  40  * @param string $url The URL the input document was found at, for relative URL resolution
  41  * @param bool $convertClassic whether or not to convert classic microformats
  42  * @return array Canonical MF2 array structure
  43  */
  44 function parse($input, $url = null, $convertClassic = true) {
  45         $parser = new Parser($input, $url);
  46         return $parser->parse($convertClassic);
  47 }
  48
  49 /**
  50  * Fetch microformats2
  51  *
  52  * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
  53  * microformats2 array structure.
  54  *
  55  * Not that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
  56  * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
  57  * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
  58  * for the actual value.
  59  *
  60  * @param string $url The URL to fetch
  61  * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
  62  * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
  63  * @return array|null canonical microformats2 array structure on success, null on failure
  64  */
  65 function fetch($url, $convertClassic = true, &$curlInfo=null) {
  66         $ch = curl_init();
  67         curl_setopt($ch, CURLOPT_URL, $url);
  68         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  69         curl_setopt($ch, CURLOPT_HEADER, 0);
  70         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
  71         curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
  72         $response = curl_exec($ch);
  73         $info = $curlInfo = curl_getinfo($ch);
  74         curl_close($ch);
  75
  76         if (strpos(strtolower($info['content_type']), 'html') === false) {
  77                 // The content was not delivered as HTML, do not attempt to parse it.
  78                 return null;
  79         }
  80
  81         $html = mb_substr($response, $info['header_size']);
  82         return parse($html, $url, $convertClassic);
  83 }
  84
  85 /**
  86  * Unicode to HTML Entities
  87  * @param string $input String containing characters to convert into HTML entities
  88  * @return string
  89  */
  90 function unicodeToHtmlEntities($input) {
  91         return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  92 }
  93
  94 /**
  95  * Collapse Whitespace
  96  *
  97  * Collapses any sequences of whitespace within a string into a single space
  98  * character.
  99  *
 100  * @deprecated since v0.2.3
 101  * @param string $str
 102  * @return string
 103  */
 104 function collapseWhitespace($str) {
 105         return preg_replace('/[\s|\n]+/', ' ', $str);
 106 }
 107
 108 function unicodeTrim($str) {
 109         // this is cheating. TODO: find a better way if this causes any problems
 110         $str = str_replace(mb_convert_encoding('&nbsp;', 'UTF-8', 'HTML-ENTITIES'), ' ', $str);
 111         $str = preg_replace('/^\s+/', '', $str);
 112         return preg_replace('/\s+$/', '', $str);
 113 }
 114
 115 /**
 116  * Microformat Name From Class string
 117  *
 118  * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 119  * p-name).
 120  *
 121  * @param string $class A space delimited list of classnames
 122  * @param string $prefix The prefix to look for
 123  * @return string|array The prefixed name of the first microfomats class found or false
 124  */
 125 function mfNamesFromClass($class, $prefix='h-') {
 126         $class = str_replace([' ', '    ', "\n"], ' ', $class);
 127         $classes = explode(' ', $class);
 128         $matches = array();
 129
 130         foreach ($classes as $classname) {
 131                 if (strpos($classname, $prefix) === 0 && $classname !== $prefix) {
 132                         $matches[] = ($prefix === 'h-') ? $classname : substr($classname, strlen($prefix));
 133                 }
 134         }
 135
 136         return $matches;
 137 }
 138
 139 /**
 140  * Get Nested µf Property Name From Class
 141  *
 142  * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
 143  * space-separated string.
 144  *
 145  * @param string $class
 146  * @return array
 147  */
 148 function nestedMfPropertyNamesFromClass($class) {
 149         $prefixes = array('p-', 'u-', 'dt-', 'e-');
 150         $propertyNames = array();
 151
 152         $class = str_replace([' ', '    ', "\n"], ' ', $class);
 153         foreach (explode(' ', $class) as $classname) {
 154                 foreach ($prefixes as $prefix) {
 155                         if (strpos($classname, $prefix) === 0 and $classname !== $prefix) {
 156                                 $propertyNames = array_merge($propertyNames, mfNamesFromClass($classname, ltrim($prefix)));
 157                         }
 158                 }
 159         }
 160
 161         return $propertyNames;
 162 }
 163
 164 /**
 165  * Wraps mfNamesFromClass to handle an element as input (common)
 166  *
 167  * @param DOMElement $e The element to get the classname for
 168  * @param string $prefix The prefix to look for
 169  * @return mixed See return value of mf2\Parser::mfNameFromClass()
 170  */
 171 function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
 172         $class = $e->getAttribute('class');
 173         return mfNamesFromClass($class, $prefix);
 174 }
 175
 176 /**
 177  * Wraps nestedMfPropertyNamesFromClass to handle an element as input
 178  */
 179 function nestedMfPropertyNamesFromElement(\DOMElement $e) {
 180         $class = $e->getAttribute('class');
 181         return nestedMfPropertyNamesFromClass($class);
 182 }
 183
 184 /**
 185  * Converts various time formats to HH:MM
 186  * @param string $time The time to convert
 187  * @return string
 188  */
 189 function convertTimeFormat($time) {
 190         $hh = $mm = $ss = '';
 191         preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);
 192
 193         // if no am/pm specified
 194         if ( empty($matches[4]) ) {
 195                 return $time;
 196         }
 197         // else am/pm specified
 198         else {
 199                 $meridiem = strtolower(str_replace('.', '', $matches[4]));
 200
 201                 // hours
 202                 $hh = $matches[1];
 203
 204                 // add 12 to the pm hours
 205                 if ( $meridiem == 'pm' && ($hh < 12) )
 206                 {
 207                         $hh += 12;
 208                 }
 209
 210                 $hh = str_pad($hh, 2, '0', STR_PAD_LEFT);
 211
 212                 // minutes
 213                 $mm = ( empty($matches[2]) ) ? '00' : $matches[2];
 214
 215                 // seconds, only if supplied
 216                 if ( !empty($matches[3]) )
 217                 {
 218                         $ss = $matches[3];
 219                 }
 220
 221                 if ( empty($ss) ) {
 222                         return sprintf('%s:%s', $hh, $mm);
 223                 }
 224                 else {
 225                         return sprintf('%s:%s:%s', $hh, $mm, $ss);
 226                 }
 227
 228         }
 229
 230 }
 231
 232 /**
 233  * Microformats2 Parser
 234  *
 235  * A class which holds state for parsing microformats2 from HTML.
 236  *
 237  * Example usage:
 238  *
 239  *     use Mf2;
 240  *     $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
 241  *     $output = $parser->parse();
 242  */
 243 class Parser {
 244         /** @var string The baseurl (if any) to use for this parse */
 245         public $baseurl;
 246
 247         /** @var DOMXPath object which can be used to query over any fragment*/
 248         public $xpath;
 249
 250         /** @var DOMDocument */
 251         public $doc;
 252
 253         /** @var SplObjectStorage */
 254         protected $parsed;
 255
 256         public $jsonMode;
 257
 258         /**
 259          * Constructor
 260          *
 261          * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
 262          * @param string $url The URL of the parsed document, for relative URL resolution
 263          * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
 264          */
 265         public function __construct($input, $url = null, $jsonMode = false) {
 266                 libxml_use_internal_errors(true);
 267                 if (is_string($input)) {
 268                         $doc = new DOMDocument();
 269                         @$doc->loadHTML(unicodeToHtmlEntities($input));
 270                 } elseif (is_a($input, 'DOMDocument')) {
 271                         $doc = $input;
 272                 } else {
 273                         $doc = new DOMDocument();
 274                         @$doc->loadHTML('');
 275                 }
 276
 277                 $this->xpath = new DOMXPath($doc);
 278
 279                 $baseurl = $url;
 280                 foreach ($this->xpath->query('//base[@href]') as $base) {
 281                         $baseElementUrl = $base->getAttribute('href');
 282
 283                         if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
 284                                 /* The base element URL is relative to the document URL.
 285                                  *
 286                                  * :/
 287                                  *
 288                                  * Perhaps the author was high? */
 289
 290                                 $baseurl = resolveUrl($url, $baseElementUrl);
 291                         } else {
 292                                 $baseurl = $baseElementUrl;
 293                         }
 294                         break;
 295                 }
 296
 297                 $this->baseurl = $baseurl;
 298                 $this->doc = $doc;
 299                 $this->parsed = new SplObjectStorage();
 300                 $this->jsonMode = $jsonMode;
 301         }
 302
 303         private function elementPrefixParsed(\DOMElement $e, $prefix) {
 304                 if (!$this->parsed->contains($e))
 305                         $this->parsed->attach($e, array());
 306
 307                 $prefixes = $this->parsed[$e];
 308                 $prefixes[] = $prefix;
 309                 $this->parsed[$e] = $prefixes;
 310         }
 311
 312         private function isElementParsed(\DOMElement $e, $prefix) {
 313                 if (!$this->parsed->contains($e))
 314                         return false;
 315
 316                 $prefixes = $this->parsed[$e];
 317
 318                 if (!in_array($prefix, $prefixes))
 319                         return false;
 320
 321                 return true;
 322         }
 323
 324         // TODO: figure out if this has problems with sms: and geo: URLs
 325         public function resolveUrl($url) {
 326                 // If the URL is seriously malformed it’s probably beyond the scope of this
 327                 // parser to try to do anything with it.
 328                 if (parse_url($url) === false)
 329                         return $url;
 330
 331                 $scheme = parse_url($url, PHP_URL_SCHEME);
 332
 333                 if (empty($scheme) and !empty($this->baseurl)) {
 334                         return resolveUrl($this->baseurl, $url);
 335                 } else {
 336                         return $url;
 337                 }
 338         }
 339
 340         // Parsing Functions
 341
 342         /**
 343          * Parse value-class/value-title on an element, joining with $separator if
 344          * there are multiple.
 345          *
 346          * @param \DOMElement $e
 347          * @param string $separator = '' if multiple value-title elements, join with this string
 348          * @return string|null the parsed value or null if value-class or -title aren’t in use
 349          */
 350         public function parseValueClassTitle(\DOMElement $e, $separator = '') {
 351                 $valueClassElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);
 352
 353                 if ($valueClassElements->length !== 0) {
 354                         // Process value-class stuff
 355                         $val = '';
 356                         foreach ($valueClassElements as $el) {
 357                                 $val .= $el->textContent;
 358                         }
 359
 360                         return unicodeTrim($val);
 361                 }
 362
 363                 $valueTitleElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);
 364
 365                 if ($valueTitleElements->length !== 0) {
 366                         // Process value-title stuff
 367                         $val = '';
 368                         foreach ($valueTitleElements as $el) {
 369                                 $val .= $el->getAttribute('title');
 370                         }
 371
 372                         return unicodeTrim($val);
 373                 }
 374
 375                 // No value-title or -class in this element
 376                 return null;
 377         }
 378
 379         /**
 380          * Given an element with class="p-*", get it’s value
 381          *
 382          * @param DOMElement $p The element to parse
 383          * @return string The plaintext value of $p, dependant on type
 384          * @todo Make this adhere to value-class
 385          */
 386         public function parseP(\DOMElement $p) {
 387                 $classTitle = $this->parseValueClassTitle($p, ' ');
 388
 389                 if ($classTitle !== null)
 390                         return $classTitle;
 391
 392                 if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') {
 393                         $pValue = $p->getAttribute('alt');
 394                 } elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') {
 395                         $pValue = $p->getAttribute('alt');
 396                 } elseif ($p->tagName == 'abbr' and $p->getAttribute('title') !== '') {
 397                         $pValue = $p->getAttribute('title');
 398                 } elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') {
 399                         $pValue = $p->getAttribute('value');
 400                 } else {
 401                         $pValue = unicodeTrim($p->textContent);
 402                 }
 403
 404                 return $pValue;
 405         }
 406
 407         /**
 408          * Given an element with class="u-*", get the value of the URL
 409          *
 410          * @param DOMElement $u The element to parse
 411          * @return string The plaintext value of $u, dependant on type
 412          * @todo make this adhere to value-class
 413          */
 414         public function parseU(\DOMElement $u) {
 415                 if (($u->tagName == 'a' or $u->tagName == 'area') and $u->getAttribute('href') !== null) {
 416                         $uValue = $u->getAttribute('href');
 417                 } elseif ($u->tagName == 'img' and $u->getAttribute('src') !== null) {
 418                         $uValue = $u->getAttribute('src');
 419                 } elseif ($u->tagName == 'object' and $u->getAttribute('data') !== null) {
 420                         $uValue = $u->getAttribute('data');
 421                 }
 422
 423                 if (isset($uValue)) {
 424                         return $this->resolveUrl($uValue);
 425                 }
 426
 427                 $classTitle = $this->parseValueClassTitle($u);
 428
 429                 if ($classTitle !== null) {
 430                         return $classTitle;
 431                 } elseif ($u->tagName == 'abbr' and $u->getAttribute('title') !== null) {
 432                         return $u->getAttribute('title');
 433                 } elseif (in_array($u->tagName, array('data', 'input')) and $u->getAttribute('value') !== null) {
 434                         return $u->getAttribute('value');
 435                 } else {
 436                         return unicodeTrim($u->textContent);
 437                 }
 438         }
 439
 440         /**
 441          * Given an element with class="dt-*", get the value of the datetime as a php date object
 442          *
 443          * @param DOMElement $dt The element to parse
 444          * @param array $dates Array of dates processed so far
 445          * @return string The datetime string found
 446          */
 447         public function parseDT(\DOMElement $dt, &$dates = array()) {
 448                 // Check for value-class pattern
 449                 $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
 450                 $dtValue = false;
 451
 452                 if ($valueClassChildren->length > 0) {
 453                         // They’re using value-class
 454                         $dateParts = array();
 455
 456                         foreach ($valueClassChildren as $e) {
 457                                 if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
 458                                         $title = $e->getAttribute('title');
 459                                         if (!empty($title))
 460                                                 $dateParts[] = $title;
 461                                 }
 462                                 elseif ($e->tagName == 'img' or $e->tagName == 'area') {
 463                                         // Use @alt
 464                                         $alt = $e->getAttribute('alt');
 465                                         if (!empty($alt))
 466                                                 $dateParts[] = $alt;
 467                                 }
 468                                 elseif ($e->tagName == 'data') {
 469                                         // Use @value, otherwise innertext
 470                                         $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
 471                                         if (!empty($value))
 472                                                 $dateParts[] = $value;
 473                                 }
 474                                 elseif ($e->tagName == 'abbr') {
 475                                         // Use @title, otherwise innertext
 476                                         $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
 477                                         if (!empty($title))
 478                                                 $dateParts[] = $title;
 479                                 }
 480                                 elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
 481                                         // Use @datetime if available, otherwise innertext
 482                                         $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
 483                                         if (!empty($dtAttr))
 484                                                 $dateParts[] = $dtAttr;
 485                                 }
 486                                 else {
 487                                         if (!empty($e->nodeValue))
 488                                                 $dateParts[] = unicodeTrim($e->nodeValue);
 489                                 }
 490                         }
 491
 492                         // Look through dateParts
 493                         $datePart = '';
 494                         $timePart = '';
 495                         foreach ($dateParts as $part) {
 496                                 // Is this part a full ISO8601 datetime?
 497                                 if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) {
 498                                         // Break completely, we’ve got our value.
 499                                         $dtValue = $part;
 500                                         break;
 501                                 } else {
 502                                         // Is the current part a valid time(+TZ?) AND no other time representation has been found?
 503                                         if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) {
 504                                                 $timePart = $part;
 505                                         } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
 506                                                 // Is the current part a valid date AND no other date representation has been found?
 507                                                 $datePart = $part;
 508                                         }
 509
 510                                         if ( !empty($datePart) && !in_array($datePart, $dates) ) {
 511                                                 $dates[] = $datePart;
 512                                         }
 513
 514                                         $dtValue = '';
 515
 516                                         if ( empty($datePart) && !empty($timePart) ) {
 517                                                 $timePart = convertTimeFormat($timePart);
 518                                                 $dtValue = unicodeTrim($timePart, 'T');
 519                                         }
 520                                         else if ( !empty($datePart) && empty($timePart) ) {
 521                                                 $dtValue = rtrim($datePart, 'T');
 522                                         }
 523                                         else {
 524                                                 $timePart = convertTimeFormat($timePart);
 525                                                 $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart, 'T');
 526                                         }
 527                                 }
 528                         }
 529                 } else {
 530                         // Not using value-class (phew).
 531                         if ($dt->tagName == 'img' or $dt->tagName == 'area') {
 532                                 // Use @alt
 533                                 // Is it an entire dt?
 534                                 $alt = $dt->getAttribute('alt');
 535                                 if (!empty($alt))
 536                                         $dtValue = $alt;
 537                         } elseif (in_array($dt->tagName, array('data'))) {
 538                                 // Use @value, otherwise innertext
 539                                 // Is it an entire dt?
 540                                 $value = $dt->getAttribute('value');
 541                                 if (!empty($value))
 542                                         $dtValue = $value;
 543                                 else
 544                                         $dtValue = $dt->nodeValue;
 545                         } elseif ($dt->tagName == 'abbr') {
 546                                 // Use @title, otherwise innertext
 547                                 // Is it an entire dt?
 548                                 $title = $dt->getAttribute('title');
 549                                 if (!empty($title))
 550                                         $dtValue = $title;
 551                                 else
 552                                         $dtValue = $dt->nodeValue;
 553                         } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
 554                                 // Use @datetime if available, otherwise innertext
 555                                 // Is it an entire dt?
 556                                 $dtAttr = $dt->getAttribute('datetime');
 557                                 if (!empty($dtAttr))
 558                                         $dtValue = $dtAttr;
 559                                 else
 560                                         $dtValue = $dt->nodeValue;
 561                         } else {
 562                                 $dtValue = $dt->nodeValue;
 563                         }
 564
 565                         if ( preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches) ) {
 566                                 $dates[] = $matches[0];
 567                         }
 568                 }
 569
 570                 /**
 571                  * if $dtValue is only a time and there are recently parsed dates,
 572                  * form the full date-time using the most recnetly parsed dt- value
 573                  */
 574                 if ( (preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates) ) {
 575                         $dtValue = convertTimeFormat($dtValue);
 576                         $dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T');
 577                 }
 578
 579                 return $dtValue;
 580         }
 581
 582         /**
 583          *      Given the root element of some embedded markup, return a string representing that markup
 584          *
 585          *      @param DOMElement $e The element to parse
 586          *      @return string $e’s innerHTML
 587          *
 588          * @todo need to mark this element as e- parsed so it doesn’t get parsed as it’s parent’s e-* too
 589          */
 590         public function parseE(\DOMElement $e) {
 591                 $classTitle = $this->parseValueClassTitle($e);
 592
 593                 if ($classTitle !== null)
 594                         return $classTitle;
 595
 596                 // Expand relative URLs within children of this element
 597                 // TODO: as it is this is not relative to only children, make this .// and rerun tests
 598                 $hyperlinkChildren = $this->xpath->query('//*[@src or @href or @data]', $e);
 599
 600                 foreach ($hyperlinkChildren as $child) {
 601                         if ($child->hasAttribute('href'))
 602                                 $child->setAttribute('href', $this->resolveUrl($child->getAttribute('href')));
 603                         if ($child->hasAttribute('src'))
 604                                 $child->setAttribute('src', $this->resolveUrl($child->getAttribute('src')));
 605                         if ($child->hasAttribute('data'))
 606                                 $child->setAttribute('data', $this->resolveUrl($child->getAttribute('data')));
 607                 }
 608
 609                 $html = '';
 610                 foreach ($e->childNodes as $node) {
 611                         $html .= $node->C14N();
 612                 }
 613
 614                 return array(
 615                         'html' => $html,
 616                         'value' => unicodeTrim($e->textContent)
 617                 );
 618         }
 619
 620         /**
 621          * Recursively parse microformats
 622          *
 623          * @param DOMElement $e The element to parse
 624          * @return array A representation of the values contained within microformat $e
 625          */
 626         public function parseH(\DOMElement $e) {
 627                 // If it’s already been parsed (e.g. is a child mf), skip
 628                 if ($this->parsed->contains($e))
 629                         return null;
 630
 631                 // Get current µf name
 632                 $mfTypes = mfNamesFromElement($e, 'h-');
 633
 634                 // Initalise var to store the representation in
 635                 $return = array();
 636                 $children = array();
 637                 $dates = array();
 638
 639                 // Handle nested microformats (h-*)
 640                 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
 641                         // Parse
 642                         $result = $this->parseH($subMF);
 643
 644                         // If result was already parsed, skip it
 645                         if (null === $result)
 646                                 continue;
 647
 648                         $result['value'] = $this->parseP($subMF);
 649
 650                         // Does this µf have any property names other than h-*?
 651                         $properties = nestedMfPropertyNamesFromElement($subMF);
 652
 653                         if (!empty($properties)) {
 654                                 // Yes! It’s a nested property µf
 655                                 foreach ($properties as $property) {
 656                                         $return[$property][] = $result;
 657                                 }
 658                         } else {
 659                                 // No, it’s a child µf
 660                                 $children[] = $result;
 661                         }
 662
 663                         // Make sure this sub-mf won’t get parsed as a µf or property
 664                         // TODO: Determine if clearing this is required?
 665                         $this->elementPrefixParsed($subMF, 'h');
 666                         $this->elementPrefixParsed($subMF, 'p');
 667                         $this->elementPrefixParsed($subMF, 'u');
 668                         $this->elementPrefixParsed($subMF, 'dt');
 669                         $this->elementPrefixParsed($subMF, 'e');
 670                 }
 671
 672                 // Handle p-*
 673                 foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
 674                         if ($this->isElementParsed($p, 'p'))
 675                                 continue;
 676
 677                         $pValue = $this->parseP($p);
 678
 679                         // Add the value to the array for it’s p- properties
 680                         foreach (mfNamesFromElement($p, 'p-') as $propName) {
 681                                 if (!empty($propName))
 682                                         $return[$propName][] = $pValue;
 683                         }
 684
 685                         // Make sure this sub-mf won’t get parsed as a top level mf
 686                         $this->elementPrefixParsed($p, 'p');
 687                 }
 688
 689                 // Handle u-*
 690                 foreach ($this->xpath->query('.//*[contains(concat(" ",  @class)," u-")]', $e) as $u) {
 691                         if ($this->isElementParsed($u, 'u'))
 692                                 continue;
 693
 694                         $uValue = $this->parseU($u);
 695
 696                         // Add the value to the array for it’s property types
 697                         foreach (mfNamesFromElement($u, 'u-') as $propName) {
 698                                 $return[$propName][] = $uValue;
 699                         }
 700
 701                         // Make sure this sub-mf won’t get parsed as a top level mf
 702                         $this->elementPrefixParsed($u, 'u');
 703                 }
 704
 705                 // Handle dt-*
 706                 foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
 707                         if ($this->isElementParsed($dt, 'dt'))
 708                                 continue;
 709
 710                         $dtValue = $this->parseDT($dt, $dates);
 711
 712                         if ($dtValue) {
 713                                 // Add the value to the array for dt- properties
 714                                 foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
 715                                         $return[$propName][] = $dtValue;
 716                                 }
 717                         }
 718
 719                         // Make sure this sub-mf won’t get parsed as a top level mf
 720                         $this->elementPrefixParsed($dt, 'dt');
 721                 }
 722
 723                 // Handle e-*
 724                 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
 725                         if ($this->isElementParsed($em, 'e'))
 726                                 continue;
 727
 728                         $eValue = $this->parseE($em);
 729
 730                         if ($eValue) {
 731                                 // Add the value to the array for e- properties
 732                                 foreach (mfNamesFromElement($em, 'e-') as $propName) {
 733                                         $return[$propName][] = $eValue;
 734                                 }
 735                         }
 736                         // Make sure this sub-mf won’t get parsed as a top level mf
 737                         $this->elementPrefixParsed($em, 'e');
 738                 }
 739
 740                 // Implied Properties
 741                 // Check for p-name
 742                 if (!array_key_exists('name', $return)) {
 743                         try {
 744                                 // Look for img @alt
 745                                 if ($e->tagName == 'img' and $e->getAttribute('alt') != '')
 746                                         throw new Exception($e->getAttribute('alt'));
 747
 748                                 if ($e->tagName == 'abbr' and $e->hasAttribute('title'))
 749                                         throw new Exception($e->getAttribute('title'));
 750
 751                                 // Look for nested img @alt
 752                                 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
 753                                         if ($em->getAttribute('alt') != '')
 754                                                 throw new Exception($em->getAttribute('alt'));
 755                                 }
 756
 757                                 // Look for double nested img @alt
 758                                 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
 759                                         if ($em->getAttribute('alt') != '')
 760                                                 throw new Exception($em->getAttribute('alt'));
 761                                 }
 762
 763                                 throw new Exception($e->nodeValue);
 764                         } catch (Exception $exc) {
 765                                 $return['name'][] = unicodeTrim($exc->getMessage());
 766                         }
 767                 }
 768
 769                 // Check for u-photo
 770                 if (!array_key_exists('photo', $return)) {
 771                         // Look for img @src
 772                         try {
 773                                 if ($e->tagName == 'img')
 774                                         throw new Exception($e->getAttribute('src'));
 775
 776                                 // Look for nested img @src
 777                                 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
 778                                         if ($em->getAttribute('src') != '')
 779                                                 throw new Exception($em->getAttribute('src'));
 780                                 }
 781
 782                                 // Look for double nested img @src
 783                                 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
 784                                         if ($em->getAttribute('src') != '')
 785                                                 throw new Exception($em->getAttribute('src'));
 786                                 }
 787                         } catch (Exception $exc) {
 788                                 $return['photo'][] = $this->resolveUrl($exc->getMessage());
 789                         }
 790                 }
 791
 792                 // Check for u-url
 793                 if (!array_key_exists('url', $return)) {
 794                         // Look for img @src
 795                         if ($e->tagName == 'a')
 796                                 $url = $e->getAttribute('href');
 797
 798                         // Look for nested img @src
 799                         foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
 800                                 $url = $em->getAttribute('href');
 801                                 break;
 802                         }
 803
 804                         if (!empty($url))
 805                                 $return['url'][] = $this->resolveUrl($url);
 806                 }
 807
 808                 // Make sure things are in alphabetical order
 809                 sort($mfTypes);
 810
 811                 // Phew. Return the final result.
 812                 $parsed = array(
 813                         'type' => $mfTypes,
 814                         'properties' => $return
 815                 );
 816                 if (!empty($children))
 817                         $parsed['children'] = array_values(array_filter($children));
 818                 return $parsed;
 819         }
 820
 821         /**
 822          * Parse Rels and Alternatives
 823          *
 824          * Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page
 825          * with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set
 826          * to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case),
 827          * it will be an empty array.
 828          */
 829         public function parseRelsAndAlternates() {
 830                 $rels = array();
 831                 $alternates = array();
 832
 833                 // Iterate through all a, area and link elements with rel attributes
 834                 foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
 835                         if ($hyperlink->getAttribute('rel') == '')
 836                                 continue;
 837
 838                         // Resolve the href
 839                         $href = $this->resolveUrl($hyperlink->getAttribute('href'));
 840
 841                         // Split up the rel into space-separated values
 842                         $linkRels = array_filter(explode(' ', $hyperlink->getAttribute('rel')));
 843
 844                         // If alternate in rels, create alternate structure, append
 845                         if (in_array('alternate', $linkRels)) {
 846                                 $alt = array(
 847                                         'url' => $href,
 848                                         'rel' => implode(' ', array_diff($linkRels, array('alternate')))
 849                                 );
 850                                 if ($hyperlink->hasAttribute('media'))
 851                                         $alt['media'] = $hyperlink->getAttribute('media');
 852
 853                                 if ($hyperlink->hasAttribute('hreflang'))
 854                                         $alt['hreflang'] = $hyperlink->getAttribute('hreflang');
 855
 856                                 $alternates[] = $alt;
 857                         } else {
 858                                 foreach ($linkRels as $rel) {
 859                                         $rels[$rel][] = $href;
 860                                 }
 861                         }
 862                 }
 863
 864                 if (empty($rels) and $this->jsonMode) {
 865                         $rels = new stdClass();
 866                 }
 867
 868                 return array($rels, $alternates);
 869         }
 870
 871         /**
 872          * Kicks off the parsing routine
 873          *
 874          * If `$htmlSafe` is set, any angle brackets in the results from non e-* properties
 875          * will be HTML-encoded, bringing all output to the same level of encoding.
 876          *
 877          * If a DOMElement is set as the $context, only descendants of that element will
 878          * be parsed for microformats.
 879          *
 880          * @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false
 881          * @param DOMElement $context optionally an element from which to parse microformats
 882          * @return array An array containing all the µfs found in the current document
 883          */
 884         public function parse($convertClassic = true, DOMElement $context = null) {
 885                 $mfs = array();
 886
 887                 if ($convertClassic) {
 888                         $this->convertLegacy();
 889                 }
 890
 891                 $mfElements = null === $context
 892                         ? $this->xpath->query('//*[contains(concat(" ", @class), " h-")]')
 893                         : $this->xpath->query('.//*[contains(concat(" ",        @class), " h-")]', $context);
 894
 895                 // Parser microformats
 896                 foreach ($mfElements as $node) {
 897                         // For each microformat
 898                         $result = $this->parseH($node);
 899
 900                         // Add the value to the array for this property type
 901                         $mfs[] = $result;
 902                 }
 903
 904                 // Parse rels
 905                 list($rels, $alternates) = $this->parseRelsAndAlternates();
 906
 907                 $top = array(
 908                         'items' => array_values(array_filter($mfs)),
 909                         'rels' => $rels
 910                 );
 911
 912                 if (count($alternates))
 913                         $top['alternates'] = $alternates;
 914
 915                 return $top;
 916         }
 917
 918         /**
 919          * Parse From ID
 920          *
 921          * Given an ID, parse all microformats which are children of the element with
 922          * that ID.
 923          *
 924          * Note that rel values are still document-wide.
 925          *
 926          * If an element with the ID is not found, an empty skeleton mf2 array structure
 927          * will be returned.
 928          *
 929          * @param string $id
 930          * @param bool $htmlSafe = false whether or not to HTML-encode angle brackets in non e-* properties
 931          * @return array
 932          */
 933         public function parseFromId($id, $convertClassic=true) {
 934                 $matches = $this->xpath->query("//*[@id='{$id}']");
 935
 936                 if (empty($matches))
 937                         return array('items' => array(), 'rels' => array(), 'alternates' => array());
 938
 939                 return $this->parse($convertClassic, $matches->item(0));
 940         }
 941
 942         /**
 943          * Convert Legacy Classnames
 944          *
 945          * Adds microformats2 classnames into a document containing only legacy
 946          * semantic classnames.
 947          *
 948          * @return Parser $this
 949          */
 950         public function convertLegacy() {
 951                 $doc = $this->doc;
 952                 $xp = new DOMXPath($doc);
 953
 954                 // replace all roots
 955                 foreach ($this->classicRootMap as $old => $new) {
 956                         foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
 957                                 $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
 958                         }
 959                 }
 960
 961                 foreach ($this->classicPropertyMap as $oldRoot => $properties) {
 962                         $newRoot = $this->classicRootMap[$oldRoot];
 963                         foreach ($properties as $old => $new) {
 964                                 foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
 965                                         $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
 966                                 }
 967                         }
 968                 }
 969
 970                 return $this;
 971         }
 972
 973         /**
 974          * XPath Query
 975          *
 976          * Runs an XPath query over the current document. Works in exactly the same
 977          * way as DOMXPath::query.
 978          *
 979          * @param string $expression
 980          * @param DOMNode $context
 981          * @return DOMNodeList
 982          */
 983         public function query($expression, $context = null) {
 984                 return $this->xpath->query($expression, $context);
 985         }
 986
 987         /**
 988          * Classic Root Classname map
 989          */
 990         public $classicRootMap = array(
 991                 'vcard' => 'h-card',
 992                 'hfeed' => 'h-feed',
 993                 'hentry' => 'h-entry',
 994                 'hrecipe' => 'h-recipe',
 995                 'hresume' => 'h-resume',
 996                 'hevent' => 'h-event',
 997                 'hreview' => 'h-review'
 998         );
 999
1000         public $classicPropertyMap = array(
1001                 'vcard' => array(
1002                         'fn' => 'p-name',
1003                         'url' => 'u-url',
1004                         'honorific-prefix' => 'p-honorific-prefix',
1005                         'given-name' => 'p-given-name',
1006                         'additional-name' => 'p-additional-name',
1007                         'family-name' => 'p-family-name',
1008                         'honorific-suffix' => 'p-honorific-suffix',
1009                         'nickname' => 'p-nickname',
1010                         'email' => 'u-email',
1011                         'logo' => 'u-logo',
1012                         'photo' => 'u-photo',
1013                         'url' => 'u-url',
1014                         'uid' => 'u-uid',
1015                         'category' => 'p-category',
1016                         'adr' => 'p-adr h-adr',
1017                         'extended-address' => 'p-extended-address',
1018                         'street-address' => 'p-street-address',
1019                         'locality' => 'p-locality',
1020                         'region' => 'p-region',
1021                         'postal-code' => 'p-postal-code',
1022                         'country-name' => 'p-country-name',
1023                         'label' => 'p-label',
1024                         'geo' => 'p-geo h-geo',
1025                         'latitude' => 'p-latitude',
1026                         'longitude' => 'p-longitude',
1027                         'tel' => 'p-tel',
1028                         'note' => 'p-note',
1029                         'bday' => 'dt-bday',
1030                         'key' => 'u-key',
1031                         'org' => 'p-org',
1032                         'organization-name' => 'p-organization-name',
1033                         'organization-unit' => 'p-organization-unit',
1034                 ),
1035                 'hentry' => array(
1036                         'entry-title' => 'p-name',
1037                         'entry-summary' => 'p-summary',
1038                         'entry-content' => 'e-content',
1039                         'published' => 'dt-published',
1040                         'updated' => 'dt-updated',
1041                         'author' => 'p-author h-card',
1042                         'category' => 'p-category',
1043                         'geo' => 'p-geo h-geo',
1044                         'latitude' => 'p-latitude',
1045                         'longitude' => 'p-longitude',
1046                 ),
1047                 'hrecipe' => array(
1048                         'fn' => 'p-name',
1049                         'ingredient' => 'p-ingredient',
1050                         'yield' => 'p-yield',
1051                         'instructions' => 'e-instructions',
1052                         'duration' => 'dt-duration',
1053                         'nutrition' => 'p-nutrition',
1054                         'photo' => 'u-photo',
1055                         'summary' => 'p-summary',
1056                         'author' => 'p-author h-card'
1057                 ),
1058                 'hresume' => array(
1059                         'summary' => 'p-summary',
1060                         'contact' => 'h-card p-contact',
1061                         'education' => 'h-event p-education',
1062                         'experience' => 'h-event p-experience',
1063                         'skill' => 'p-skill',
1064                         'affiliation' => 'p-affiliation h-card',
1065                 ),
1066                 'hevent' => array(
1067                         'dtstart' => 'dt-start',
1068                         'dtend' => 'dt-end',
1069                         'duration' => 'dt-duration',
1070                         'description' => 'p-description',
1071                         'summary' => 'p-summary',
1072                         'description' => 'p-description',
1073                         'url' => 'u-url',
1074                         'category' => 'p-category',
1075                         'location' => 'h-card',
1076                         'geo' => 'p-location h-geo'
1077                 ),
1078                 'hreview' => array(
1079                         'summary' => 'p-name',
1080                         'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec
1081                         'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
1082                         'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
1083                         'reviewer' => 'p-reviewer p-author h-card',
1084                         'dtreviewed' => 'dt-reviewed',
1085                         'rating' => 'p-rating',
1086                         'best' => 'p-best',
1087                         'worst' => 'p-worst',
1088                         'description' => 'p-description'
1089                 )
1090         );
1091 }
1092
/**
 * Split a URI into the five components used by RFC 3986 reference resolution.
 *
 * The authority is rebuilt from the user, pass, host and port parts reported by
 * parse_url() and is only set when a host is present. Components that are absent
 * from the URI stay null.
 *
 * @param string $uri The URI or relative reference to split
 * @return array Array with the keys scheme, authority, path, query and fragment;
 *               all null when parse_url() rejects the input as malformed
 */
function parseUriToComponents($uri) {
        $result = array(
                'scheme' => null,
                'authority' => null,
                'path' => null,
                'query' => null,
                'fragment' => null
        );

        $u = @parse_url($uri);

        // parse_url() returns false for seriously malformed URIs; indexing into that
        // would raise warnings (or a TypeError on PHP 8), so report "no components".
        if(!is_array($u))
                return $result;

        if(isset($u['scheme']))
                $result['scheme'] = $u['scheme'];

        if(isset($u['host'])) {
                $hasUser = isset($u['user']);
                $hasPass = isset($u['pass']);

                // authority = [ user [ ":" pass ] "@" ] host [ ":" port ]
                $authority = '';
                if($hasUser)
                        $authority .= $u['user'];
                if($hasPass)
                        $authority .= ':' . $u['pass'];
                if($hasUser || $hasPass)
                        $authority .= '@';
                $authority .= $u['host'];
                if(isset($u['port']))
                        $authority .= ':' . $u['port'];

                $result['authority'] = $authority;
        }

        // The remaining components are copied across unchanged
        foreach(array('path', 'query', 'fragment') as $component) {
                if(isset($u[$component]))
                        $result[$component] = $u[$component];
        }

        return $result;
}
1130
/**
 * Resolve a URI reference against a base URI, following RFC 3986 section 5.2.
 *
 * Components are treated as present when they are non-empty strings. A plain
 * truthiness test would also discard the string "0", losing e.g. the fragment of
 * "page#0" or the query of "?0".
 *
 * @param string $baseURI The URI the reference is relative to
 * @param string $referenceURI The (possibly relative) reference to resolve
 * @return string The recomposed target URI
 */
function resolveUrl($baseURI, $referenceURI) {
        $target = array(
                'scheme' => null,
                'authority' => null,
                'path' => null,
                'query' => null,
                'fragment' => null
        );

        # 5.2.1 Pre-parse the Base URI
        # The base URI (Base) is established according to the procedure of
        # Section 5.1 and parsed into the five main components described in
        # Section 3
        $base = parseUriToComponents($baseURI);

        # If base path is blank (http://example.com) then set it to /
        # (I can't tell if this is actually in the RFC or not, but seems like it makes sense)
        if($base['path'] == null)
                $base['path'] = '/';

        # 5.2.2. Transform References

        # The URI reference is parsed into the five URI components
        # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R);
        $reference = parseUriToComponents($referenceURI);

        # A non-strict parser may ignore a scheme in the reference
        # if it is identical to the base URI's scheme.
        # TODO

        if((string) $reference['scheme'] !== '') {
                # Reference is absolute: only dot segments need cleaning up
                $target['scheme'] = $reference['scheme'];
                $target['authority'] = $reference['authority'];
                $target['path'] = removeDotSegments($reference['path']);
                $target['query'] = $reference['query'];
        } else {
                if((string) $reference['authority'] !== '') {
                        # Network-path reference (//host/path): inherit the scheme only
                        $target['authority'] = $reference['authority'];
                        $target['path'] = removeDotSegments($reference['path']);
                        $target['query'] = $reference['query'];
                } else {
                        if($reference['path'] == '') {
                                # Same-document reference: keep the base path, and the base
                                # query too unless the reference brings its own
                                $target['path'] = $base['path'];
                                if((string) $reference['query'] !== '') {
                                        $target['query'] = $reference['query'];
                                } else {
                                        $target['query'] = $base['query'];
                                }
                        } else {
                                if(substr($reference['path'], 0, 1) == '/') {
                                        # Absolute-path reference
                                        $target['path'] = removeDotSegments($reference['path']);
                                } else {
                                        # Relative-path reference: merge with the base path first
                                        $target['path'] = mergePaths($base, $reference);
                                        $target['path'] = removeDotSegments($target['path']);
                                }
                                $target['query'] = $reference['query'];
                        }
                        $target['authority'] = $base['authority'];
                }
                $target['scheme'] = $base['scheme'];
        }
        $target['fragment'] = $reference['fragment'];

        # 5.3 Component Recomposition
        $result = '';
        if((string) $target['scheme'] !== '') {
                $result .= $target['scheme'] . ':';
        }
        if((string) $target['authority'] !== '') {
                $result .= '//' . $target['authority'];
        }
        $result .= $target['path'];
        if((string) $target['query'] !== '') {
                $result .= '?' . $target['query'];
        }
        if((string) $target['fragment'] !== '') {
                $result .= '#' . $target['fragment'];
        } elseif($referenceURI == '#') {
                # A bare "#" reference carries an empty fragment; keep the marker
                $result .= '#';
        }
        return $result;
}
1213
/**
 * 5.2.3 Merge Paths
 *
 * Merges a relative-path reference with the path of the base URI as described in
 * RFC 3986 section 5.2.3. Dot segments are not removed here.
 *
 * @param array $base Base URI components; the `authority` and `path` keys are read
 * @param array $reference Reference components; the `path` key is read
 * @return string The merged path
 */
function mergePaths($base, $reference) {
        # If the base URI has a defined authority component and an empty
        #    path,
        if((string) $base['authority'] !== '' && $base['path'] == null) {
                # then return a string consisting of "/" concatenated with the
                # reference's path; otherwise,
                return '/' . $reference['path'];
        }

        $pos = strrpos((string) $base['path'], '/');

        if($pos !== false) {
                # return a string consisting of the reference's path component
                #    appended to all but the last segment of the base URI's path (i.e.,
                #    excluding any characters after the right-most "/" in the base URI
                #    path,
                return substr($base['path'], 0, $pos + 1) . $reference['path'];
        }

        #    or excluding the entire base URI path if it does not contain
        #    any "/" characters). Nothing of the base path survives in that case,
        #    so the merged path is the reference's path on its own.
        return (string) $reference['path'];
}
1237
/**
 * 5.2.4.A Remove leading ../ or ./
 *
 * Strips one leading "../" or, failing that, one leading "./" from the input
 * buffer. The buffer is left untouched when it starts with neither.
 *
 * @param string $input Input buffer, modified in place
 */
function removeLeadingDotSlash(&$input) {
        // "../" is tested first, exactly as step A of the RFC lists the prefixes
        foreach(array('../', './') as $prefix) {
                $width = strlen($prefix);

                if(substr($input, 0, $width) == $prefix) {
                        $input = substr($input, $width);
                        return;
                }
        }
}
1246
/**
 * 5.2.4.B Replace leading /. with /
 *
 * Replaces a leading "/./" with "/". Any other input has its first two
 * characters replaced by "/", which is what a buffer of exactly "/." needs.
 *
 * @param string $input Input buffer, modified in place
 */
function removeLeadingSlashDot(&$input) {
        // Three characters go for "/./", two for the bare "/." form
        $prefixLength = (substr($input, 0, 3) == '/./') ? 3 : 2;

        $input = '/' . substr($input, $prefixLength);
}
1255
/**
 * 5.2.4.C Given leading /../ remove component from output buffer
 *
 * Replaces a leading "/../" in the input buffer with "/" (any other input loses
 * its first three characters instead, which suits a buffer of exactly "/.."),
 * then cuts the output buffer back to just before its right-most "/".
 *
 * @param string $input Input buffer, modified in place
 * @param string $output Output buffer, modified in place
 */
function removeOneDirLevel(&$input, &$output) {
        // Four characters go for "/../", three for the bare "/.." form
        $prefixLength = (substr($input, 0, 4) == '/../') ? 4 : 3;
        $input = '/' . substr($input, $prefixLength);

        // Drop the last segment of the output together with the slash before it
        $lastSlash = strrpos($output, '/');
        $output = substr($output, 0, $lastSlash);
}
1265
/**
 * 5.2.4.D Remove . and .. if it's the only thing in the input
 *
 * Drops the first character when the buffer is exactly "." and the first two
 * characters in every other case.
 *
 * @param string $input Input buffer, modified in place
 */
function removeLoneDotDot(&$input) {
        $charsToDrop = ($input == '.') ? 1 : 2;

        $input = substr($input, $charsToDrop);
}
1274
/**
 * 5.2.4.E Move one segment from input to output
 *
 * Appends the first path segment of the input buffer to the output buffer and
 * removes it from the input. The segment includes a leading "/" if there is one
 * and ends before the next "/", or at the end of the input.
 *
 * @param string $input Input buffer, modified in place
 * @param string $output Output buffer, modified in place
 */
function moveOneSegmentFromInput(&$input, &$output) {
        // A leading slash belongs to the segment, so look for the next one after it
        $searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
        $boundary = strpos($input, '/', $searchFrom);

        if($boundary === false) {
                // No further slash: the whole remaining input is the segment
                $segment = $input;
                $remainder = '';
        } else {
                $segment = substr($input, 0, $boundary);
                $remainder = substr($input, $boundary);
        }

        $output .= $segment;
        $input = $remainder;
}
1291
/**
 * 5.2.4 Remove Dot Segments
 *
 * Interprets and removes the "." and ".." segments of a path following the
 * algorithm of RFC 3986 section 5.2.4. Each lettered step is carried out by the
 * helper function called in the matching branch.
 *
 * @param string|null $path The path to clean up; null is treated as an empty path
 * @return string The path without dot segments
 */
function removeDotSegments($path) {
        # 1.  The input buffer is initialized with the now-appended path
        #     components and the output buffer is initialized to the empty
        #     string.
        $input = (string) $path;
        $output = '';

        # 2.  While the input buffer is not empty, loop as follows:
        #     (compared as a string: a plain truthiness test would stop early on a
        #     remaining input of "0" and silently drop that segment)
        while((string) $input !== '') {
                if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
                        #     A.  If the input buffer begins with a prefix of "../" or "./",
                        #         then remove that prefix from the input buffer; otherwise,
                        removeLeadingDotSlash($input);
                } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
                        #     B.  if the input buffer begins with a prefix of "/./" or "/.",
                        #         where "." is a complete path segment, then replace that
                        #         prefix with "/" in the input buffer; otherwise,
                        removeLeadingSlashDot($input);
                } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
                        #     C.  if the input buffer begins with a prefix of "/../" or "/..",
                        #          where ".." is a complete path segment, then replace that
                        #          prefix with "/" in the input buffer and remove the last
                        #          segment and its preceding "/" (if any) from the output
                        #          buffer; otherwise,
                        removeOneDirLevel($input, $output);
                } elseif($input == '.' || $input == '..') {
                        #     D.  if the input buffer consists only of "." or "..", then remove
                        #         that from the input buffer; otherwise,
                        removeLoneDotDot($input);
                } else {
                        #     E.  move the first path segment in the input buffer to the end of
                        #         the output buffer and any subsequent characters up to, but not including,
                        #         the next "/" character or the end of the input buffer
                        moveOneSegmentFromInput($input, $output);
                }
        }

        # 3.  Finally, the output buffer is returned as the result.
        return $output;
}