/**
 * Parse Microformats2
 *
 * Functional shortcut for the commonest cases of parsing microformats2 from HTML:
 * builds a Parser for the input and runs it straight away.
 *
 * Example usage:
 *
 *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * For the h-card above, the item's properties include `"name": ["Barnaby Walters"]`.
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, for relative URL resolution
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
	return (new Parser($input, $url))->parse($convertClassic);
}
/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML,
 * returns the parsed microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it
 * will be parsed all the same, as there are legitimate cases where error pages might contain useful
 * microformats (for example a deleted h-entry resulting in a 410 Gone page with a stub h-entry explaining
 * the reason for deletion). Look in $curlInfo['http_code'] for the actual value.
 *
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	// Headers are NOT included in the returned string, so $response is the body only.
	curl_setopt($ch, CURLOPT_HEADER, 0);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
	curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
	$response = curl_exec($ch);
	$info = $curlInfo = curl_getinfo($ch);
	curl_close($ch);

	// The transfer itself failed (DNS, connection, too many redirects, ...).
	if ($response === false) {
		return null;
	}

	$contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
	if (stripos($contentType, 'html') === false) {
		// The content was not delivered as HTML, do not attempt to parse it.
		return null;
	}

	// No header bytes to strip: cutting header_size characters off a header-less
	// response would discard the start of the document.
	return parse($response, $url, $convertClassic);
}
/**
 * Unicode to HTML Entities
 *
 * Detects the encoding of the input and re-encodes it so that characters
 * outside ASCII are expressed as HTML entities.
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string The input with those characters replaced by entities
 */
function unicodeToHtmlEntities($input) {
	$detectedEncoding = mb_detect_encoding($input);

	return mb_convert_encoding($input, 'HTML-ENTITIES', $detectedEncoding);
}
/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space.
 * Only whitespace is affected; every other character (including `|`) is left
 * untouched.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
	// \s already covers \n. Inside a character class `|` is a literal pipe, not
	// alternation, so it must not appear in the class.
	return preg_replace('/\s+/', ' ', $str);
}
/**
 * Trim leading and trailing whitespace, treating U+00A0 NO-BREAK SPACE as whitespace.
 *
 * Every no-break space in the string (not only those at the ends) is first
 * replaced with an ordinary space, then whitespace is stripped from both ends.
 *
 * Only one argument is used; any further arguments passed by callers are ignored.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
	// this is cheating. TODO: find a better way if this causes any problems
	// "\xC2\xA0" is U+00A0 encoded as UTF-8, written literally so no entity
	// decoding round-trip is needed to obtain it.
	$str = str_replace("\xC2\xA0", ' ', $str);

	return preg_replace('/^\s+|\s+$/', '', $str);
}
/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name).
 *
 * For the `h-` prefix the full classname is returned (`h-card`); for any other
 * prefix the prefix is stripped (`p-name` becomes `name`). A classname equal to
 * the bare prefix is ignored.
 *
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array The matching names, in the order found; empty if there are none
 */
function mfNamesFromClass($class, $prefix='h-') {
	// Split on every ASCII whitespace character HTML allows between class
	// tokens, so "\r" or form feeds never stay attached to a classname.
	$classes = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);
	$prefixLength = strlen($prefix);
	$matches = array();

	foreach ($classes as $classname) {
		if (strpos($classname, $prefix) === 0 && $classname !== $prefix) {
			$matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
		}
	}

	return $matches;
}
/**
 * Get Nested µf Property Name From Class
 *
 * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
 * whitespace-separated string, with the prefix removed. A classname equal to
 * the bare prefix is ignored.
 *
 * @param string $class
 * @return array The property names, in the order found
 */
function nestedMfPropertyNamesFromClass($class) {
	$prefixes = array('p-', 'u-', 'dt-', 'e-');
	$propertyNames = array();

	// Split on all ASCII whitespace HTML allows between class tokens.
	$classnames = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);

	foreach ($classnames as $classname) {
		foreach ($prefixes as $prefix) {
			if (strpos($classname, $prefix) === 0 && $classname !== $prefix) {
				$propertyNames[] = substr($classname, strlen($prefix));
				// The prefixes are mutually exclusive, so one match is final.
				break;
			}
		}
	}

	return $propertyNames;
}
/**
 * Wraps mfNamesFromClass to handle an element as input (common)
 *
 * @param DOMElement $e The element to get the classname for
 * @param string $prefix The prefix to look for
 * @return mixed See return value of mfNamesFromClass()
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
	return mfNamesFromClass($e->getAttribute('class'), $prefix);
}
/**
 * Wraps nestedMfPropertyNamesFromClass to handle an element as input
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return mixed See return value of nestedMfPropertyNamesFromClass()
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
	return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}
/**
 * Converts various time formats to HH:MM (or HH:MM:SS when seconds are supplied)
 *
 * Only times carrying an am/pm marker are rewritten; anything else is returned
 * unchanged. Hours are converted to the 24-hour clock: pm adds 12 to hours
 * below 12, and 12am becomes 00.
 *
 * @param string $time The time to convert
 * @return string The converted time, or $time itself if no am/pm marker was found
 */
function convertTimeFormat($time) {
	preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

	// if no am/pm specified there is nothing to convert
	if (empty($matches[4])) {
		return $time;
	}

	// Normalise "a.m.", "PM" etc. to "am" / "pm"
	$meridiem = strtolower(str_replace('.', '', $matches[4]));
	$hours = (int) $matches[1];

	if ($meridiem == 'pm' && $hours < 12) {
		// add 12 to the pm hours
		$hours += 12;
	} elseif ($meridiem == 'am' && $hours == 12) {
		// 12am is midnight, i.e. hour 00 on the 24-hour clock
		$hours = 0;
	}

	$hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);
	$mm = empty($matches[2]) ? '00' : $matches[2];

	// seconds, only if supplied
	if (empty($matches[3])) {
		return sprintf('%s:%s', $hh, $mm);
	}

	return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}
233 * Microformats2 Parser
235 * A class which holds state for parsing microformats2 from HTML.
240 * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
241 * $output = $parser->parse();
244 /** @var string The baseurl (if any) to use for this parse */
247 /** @var DOMXPath object which can be used to query over any fragment*/
250 /** @var DOMDocument */
253 /** @var SplObjectStorage */
/**
 * Constructor
 *
 * Loads the input into a DOMDocument (unless one was passed in), prepares the
 * XPath helper and works out the base URL for relative URL resolution: the
 * document URL by default, overridden by the first `<base href>` element.
 *
 * Empty or unsupported input results in an empty document rather than an error.
 *
 * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
 * @param string $url The URL of the parsed document, for relative URL resolution
 * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
 */
public function __construct($input, $url = null, $jsonMode = false) {
	// Have libxml collect markup errors internally instead of raising warnings.
	libxml_use_internal_errors(true);

	if ($input instanceof DOMDocument) {
		$doc = $input;
	} else {
		$doc = new DOMDocument();
		$html = is_string($input) ? unicodeToHtmlEntities($input) : '';

		// loadHTML() refuses an empty source (a ValueError on PHP 8), so an
		// empty input simply leaves the fresh document empty.
		if (is_string($html) && $html !== '') {
			@$doc->loadHTML($html);
		}
	}

	$this->xpath = new DOMXPath($doc);

	$baseurl = $url;
	foreach ($this->xpath->query('//base[@href]') as $base) {
		$baseElementUrl = $base->getAttribute('href');

		if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
			// The base element URL is itself relative to the document URL.
			$baseurl = resolveUrl($url, $baseElementUrl);
		} else {
			$baseurl = $baseElementUrl;
		}

		// Only the first base element counts.
		break;
	}

	$this->baseurl = $baseurl;
	$this->doc = $doc;
	$this->parsed = new SplObjectStorage();
	$this->jsonMode = $jsonMode;
}
/**
 * Record that $e has been handled for the given property prefix.
 *
 * @param DOMElement $e
 * @param string $prefix e.g. 'h', 'p', 'u', 'dt' or 'e'
 */
private function elementPrefixParsed(\DOMElement $e, $prefix) {
	$seenPrefixes = $this->parsed->contains($e) ? $this->parsed[$e] : array();
	$seenPrefixes[] = $prefix;

	$this->parsed[$e] = $seenPrefixes;
}
/**
 * Whether $e has already been recorded as handled for the given prefix.
 *
 * @param DOMElement $e
 * @param string $prefix
 * @return bool
 */
private function isElementParsed(\DOMElement $e, $prefix) {
	return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
}
/**
 * Resolve a possibly-relative URL against this parser's base URL.
 *
 * URLs which already carry a scheme, URLs which cannot be parsed at all, and
 * any URL when no base URL is known are returned unchanged.
 *
 * @param string $url
 * @return string
 */
// TODO: figure out if this has problems with sms: and geo: URLs
public function resolveUrl($url) {
	// If the URL is seriously malformed it’s probably beyond the scope of this
	// parser to try to do anything with it.
	if (parse_url($url) === false) {
		return $url;
	}

	$isRelative = empty(parse_url($url, PHP_URL_SCHEME));
	$haveBase = !empty($this->baseurl);

	return ($isRelative && $haveBase) ? resolveUrl($this->baseurl, $url) : $url;
}
/**
 * Parse value-class/value-title on an element.
 *
 * Direct children with class `value` take precedence: their text content is
 * concatenated. Failing that, the title attributes of direct children with
 * class `value-title` are concatenated. The result is trimmed.
 *
 * Note: $separator is accepted but not applied by this implementation; the
 * parts are joined without anything in between.
 *
 * @param \DOMElement $e
 * @param string $separator = '' currently unused
 * @return string|null the parsed value or null if value-class or -title aren’t in use
 */
public function parseValueClassTitle(\DOMElement $e, $separator = '') {
	$valueNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);

	if ($valueNodes->length !== 0) {
		// value-class: join the text of every value child
		$collected = '';
		foreach ($valueNodes as $node) {
			$collected .= $node->textContent;
		}

		return unicodeTrim($collected);
	}

	$titleNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);

	if ($titleNodes->length === 0) {
		// No value-title or -class in this element
		return null;
	}

	// value-title: join the title attribute of every value-title child
	$collected = '';
	foreach ($titleNodes as $node) {
		$collected .= $node->getAttribute('title');
	}

	return unicodeTrim($collected);
}
/**
 * Given an element with class="p-*", get its value
 *
 * Order of precedence: value-class/value-title children, then the tag-specific
 * attribute (img/area @alt, abbr @title, data/input @value) when it is not
 * empty, then the trimmed text content.
 *
 * @param DOMElement $p The element to parse
 * @return string The plaintext value of $p, dependant on type
 */
public function parseP(\DOMElement $p) {
	$classTitle = $this->parseValueClassTitle($p, ' ');
	if ($classTitle !== null) {
		return $classTitle;
	}

	// Which attribute carries the value for which tag.
	$valueAttributeByTag = array(
		'img' => 'alt',
		'area' => 'alt',
		'abbr' => 'title',
		'data' => 'value',
		'input' => 'value',
	);

	$tag = $p->tagName;
	if (isset($valueAttributeByTag[$tag])) {
		$attributeValue = $p->getAttribute($valueAttributeByTag[$tag]);
		if ($attributeValue !== '') {
			return $attributeValue;
		}
	}

	return unicodeTrim($p->textContent);
}
/**
 * Given an element with class="u-*", get the value of the URL
 *
 * Order of precedence:
 *  1. a/area @href, img @src, object @data — resolved against the base URL
 *  2. value-class/value-title children
 *  3. abbr @title, data/input @value
 *  4. the trimmed text content
 *
 * An attribute only counts when it is actually present on the element.
 *
 * @param DOMElement $u The element to parse
 * @return string The plaintext value of $u, dependant on type
 */
public function parseU(\DOMElement $u) {
	$urlAttributeByTag = array(
		'a' => 'href',
		'area' => 'href',
		'img' => 'src',
		'object' => 'data',
	);
	$tag = $u->tagName;

	// getAttribute() returns '' (never null) for a missing attribute, so
	// presence has to be tested with hasAttribute().
	if (isset($urlAttributeByTag[$tag]) && $u->hasAttribute($urlAttributeByTag[$tag])) {
		return $this->resolveUrl($u->getAttribute($urlAttributeByTag[$tag]));
	}

	$classTitle = $this->parseValueClassTitle($u);
	if ($classTitle !== null) {
		return $classTitle;
	}

	if ($tag === 'abbr' && $u->hasAttribute('title')) {
		return $u->getAttribute('title');
	}

	if (in_array($tag, array('data', 'input'), true) && $u->hasAttribute('value')) {
		return $u->getAttribute('value');
	}

	return unicodeTrim($u->textContent);
}
441 * Given an element with class="dt-*", get the value of the datetime as a php date object
443 * @param DOMElement $dt The element to parse
444 * @param array $dates Array of dates processed so far
445 * @return string The datetime string found
447 public function parseDT(\DOMElement $dt, &$dates = array()) {
// NOTE(review): the visible code assembles the value by string concatenation;
// no date object is constructed anywhere in this listing.
// $dates is taken by reference, so dates recorded here are visible to the caller.
448 // Check for value-class pattern
// Selects direct children carrying a `value` or `value-title` class token.
449 $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
452 if ($valueClassChildren->length > 0) {
453 // They’re using value-class
454 $dateParts = array();
// Collect one candidate string per value-class child; the attribute that is
// read depends on the child's class/tag.
456 foreach ($valueClassChildren as $e) {
457 if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
458 $title = $e->getAttribute('title');
460 $dateParts[] = $title;
462 elseif ($e->tagName == 'img' or $e->tagName == 'area') {
464 $alt = $e->getAttribute('alt');
468 elseif ($e->tagName == 'data') {
469 // Use @value, otherwise innertext
470 $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
472 $dateParts[] = $value;
474 elseif ($e->tagName == 'abbr') {
475 // Use @title, otherwise innertext
476 $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
478 $dateParts[] = $title;
480 elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
481 // Use @datetime if available, otherwise innertext
482 $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
484 $dateParts[] = $dtAttr;
487 if (!empty($e->nodeValue))
488 $dateParts[] = unicodeTrim($e->nodeValue);
492 // Look through dateParts
495 foreach ($dateParts as $part) {
496 // Is this part a full ISO8601 datetime?
// Pattern: YYYY-MM-DDTHH:MM, optional :SS, optional timezone offset.
497 if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) {
498 // Break completely, we’ve got our value.
502 // Is the current part a valid time(+TZ?) AND no other time representation has been found?
// NOTE(review): inside a character class `|` is a literal, so `[+|-]` also
// accepts `|` as an offset sign and `[a|p]m` also accepts `|m`.
503 if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) {
505 } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
506 // Is the current part a valid date AND no other date representation has been found?
// Remember each distinct date part so later time-only values can be combined with it.
510 if ( !empty($datePart) && !in_array($datePart, $dates) ) {
511 $dates[] = $datePart;
// Three outcomes: time only, date only, or date and time joined with 'T'.
516 if ( empty($datePart) && !empty($timePart) ) {
517 $timePart = convertTimeFormat($timePart);
// NOTE(review): unicodeTrim() is declared with a single parameter, so the
// 'T' argument passed here (and in the two similar calls below) has no effect.
518 $dtValue = unicodeTrim($timePart, 'T');
520 else if ( !empty($datePart) && empty($timePart) ) {
521 $dtValue = rtrim($datePart, 'T');
524 $timePart = convertTimeFormat($timePart);
525 $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart, 'T');
530 // Not using value-class (phew).
// Without value-class the value comes from a tag-specific attribute of $dt
// itself (@alt, @value, @title, @datetime) or from its node value.
531 if ($dt->tagName == 'img' or $dt->tagName == 'area') {
533 // Is it an entire dt?
534 $alt = $dt->getAttribute('alt');
537 } elseif (in_array($dt->tagName, array('data'))) {
538 // Use @value, otherwise innertext
539 // Is it an entire dt?
540 $value = $dt->getAttribute('value');
544 $dtValue = $dt->nodeValue;
545 } elseif ($dt->tagName == 'abbr') {
546 // Use @title, otherwise innertext
547 // Is it an entire dt?
548 $title = $dt->getAttribute('title');
552 $dtValue = $dt->nodeValue;
553 } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
554 // Use @datetime if available, otherwise innertext
555 // Is it an entire dt?
556 $dtAttr = $dt->getAttribute('datetime');
560 $dtValue = $dt->nodeValue;
562 $dtValue = $dt->nodeValue;
// Record the first YYYY-MM-DD found in the value. Unlike the value-class
// branch above, no duplicate check is made before appending.
565 if ( preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches) ) {
566 $dates[] = $matches[0];
571 * if $dtValue is only a time and there are recently parsed dates,
572 * form the full date-time using the most recnetly parsed dt- value
// The value must START with a time (patterns are anchored with ^); it is then
// normalised by convertTimeFormat() and prefixed with the last entry of $dates.
574 if ( (preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates) ) {
575 $dtValue = convertTimeFormat($dtValue);
576 $dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T');
/**
 * Given the root element of some embedded markup, return a string representing that markup
 *
 * If value-class/value-title children are present their value is returned
 * as a plain string. Otherwise relative URLs in @href, @src and @data of the
 * element's descendants are made absolute (the DOM is modified in place) and
 * an array with the serialised child markup and the plain text is returned.
 *
 * @param DOMElement $e The element to parse
 * @return string|array the value-class string, or array('html' => $e’s innerHTML, 'value' => $e’s trimmed text)
 *
 * @todo need to mark this element as e- parsed so it doesn’t get parsed as its parent’s e-* too
 */
public function parseE(\DOMElement $e) {
	$classTitle = $this->parseValueClassTitle($e);
	if ($classTitle !== null) {
		return $classTitle;
	}

	// Expand relative URLs within descendants of this element only. The
	// leading "." keeps the query relative to $e; a bare "//" would select
	// (and rewrite) matching elements anywhere in the document.
	$urlAttributes = array('href', 'src', 'data');
	foreach ($this->xpath->query('.//*[@src or @href or @data]', $e) as $child) {
		foreach ($urlAttributes as $attribute) {
			if ($child->hasAttribute($attribute)) {
				$child->setAttribute($attribute, $this->resolveUrl($child->getAttribute($attribute)));
			}
		}
	}

	// innerHTML: the canonical serialisation of every child node
	$html = '';
	foreach ($e->childNodes as $node) {
		$html .= $node->C14N();
	}

	return array(
		'html' => $html,
		'value' => unicodeTrim($e->textContent)
	);
}
621 * Recursively parse microformats
623 * @param DOMElement $e The element to parse
624 * @return array A representation of the values contained within microformat $e
626 public function parseH(\DOMElement $e) {
// Overview of the visible steps: nested h-* elements first (recursively), then
// p-*, u-*, dt-* and e-* properties, then the implied name / photo / url
// properties, and finally the result array with 'properties' and, when there
// are any, 'children'.
627 // If it’s already been parsed (e.g. is a child mf), skip
628 if ($this->parsed->contains($e))
631 // Get current µf name
632 $mfTypes = mfNamesFromElement($e, 'h-');
634 // Initalise var to store the representation in
639 // Handle nested microformats (h-*)
// NOTE(review): the XPath looks for the substring " h-" in " " + @class, i.e. an
// h-* token at the start of the attribute or after a space character.
640 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
642 $result = $this->parseH($subMF);
644 // If result was already parsed, skip it
645 if (null === $result)
// The nested microformat's plain 'value' is computed exactly like a p-* property.
648 $result['value'] = $this->parseP($subMF);
650 // Does this µf have any property names other than h-*?
651 $properties = nestedMfPropertyNamesFromElement($subMF);
653 if (!empty($properties)) {
654 // Yes! It’s a nested property µf
655 foreach ($properties as $property) {
656 $return[$property][] = $result;
659 // No, it’s a child µf
660 $children[] = $result;
663 // Make sure this sub-mf won’t get parsed as a µf or property
664 // TODO: Determine if clearing this is required?
665 $this->elementPrefixParsed($subMF, 'h');
666 $this->elementPrefixParsed($subMF, 'p');
667 $this->elementPrefixParsed($subMF, 'u');
668 $this->elementPrefixParsed($subMF, 'dt');
669 $this->elementPrefixParsed($subMF, 'e');
// p-* properties: one parsed value per element, stored under every p- name on it.
673 foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
674 if ($this->isElementParsed($p, 'p'))
677 $pValue = $this->parseP($p);
679 // Add the value to the array for it’s p- properties
680 foreach (mfNamesFromElement($p, 'p-') as $propName) {
681 if (!empty($propName))
682 $return[$propName][] = $pValue;
685 // Make sure this sub-mf won’t get parsed as a top level mf
686 $this->elementPrefixParsed($p, 'p');
// u-* properties
690 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
691 if ($this->isElementParsed($u, 'u'))
694 $uValue = $this->parseU($u);
696 // Add the value to the array for it’s property types
697 foreach (mfNamesFromElement($u, 'u-') as $propName) {
698 $return[$propName][] = $uValue;
701 // Make sure this sub-mf won’t get parsed as a top level mf
702 $this->elementPrefixParsed($u, 'u');
// dt-* properties. parseDT() takes $dates by reference, so dates found by one
// dt-* element are available when a later one in this microformat is parsed.
706 foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
707 if ($this->isElementParsed($dt, 'dt'))
710 $dtValue = $this->parseDT($dt, $dates);
713 // Add the value to the array for dt- properties
714 foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
715 $return[$propName][] = $dtValue;
719 // Make sure this sub-mf won’t get parsed as a top level mf
720 $this->elementPrefixParsed($dt, 'dt');
// e-* properties
724 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
725 if ($this->isElementParsed($em, 'e'))
728 $eValue = $this->parseE($em);
731 // Add the value to the array for e- properties
732 foreach (mfNamesFromElement($em, 'e-') as $propName) {
733 $return[$propName][] = $eValue;
736 // Make sure this sub-mf won’t get parsed as a top level mf
737 $this->elementPrefixParsed($em, 'e');
740 // Implied Properties
// Implied name: the first candidate found is thrown as an Exception whose
// message carries the value; the catch below stores the trimmed message.
// The exception is used purely as an early exit, not as an error.
742 if (!array_key_exists('name', $return)) {
745 if ($e->tagName == 'img' and $e->getAttribute('alt') != '')
746 throw new Exception($e->getAttribute('alt'));
748 if ($e->tagName == 'abbr' and $e->hasAttribute('title'))
749 throw new Exception($e->getAttribute('title'));
751 // Look for nested img @alt
// The predicate keeps only an img that has no element siblings.
752 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
753 if ($em->getAttribute('alt') != '')
754 throw new Exception($em->getAttribute('alt'));
757 // Look for double nested img @alt
758 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
759 if ($em->getAttribute('alt') != '')
760 throw new Exception($em->getAttribute('alt'));
// Last resort: the element's own node value.
763 throw new Exception($e->nodeValue);
764 } catch (Exception $exc) {
765 $return['name'][] = unicodeTrim($exc->getMessage());
// Implied photo: same throw-to-exit technique; the value is resolved as a URL.
770 if (!array_key_exists('photo', $return)) {
773 if ($e->tagName == 'img')
774 throw new Exception($e->getAttribute('src'));
776 // Look for nested img @src
777 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
778 if ($em->getAttribute('src') != '')
779 throw new Exception($em->getAttribute('src'));
782 // Look for double nested img @src
783 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
784 if ($em->getAttribute('src') != '')
785 throw new Exception($em->getAttribute('src'));
787 } catch (Exception $exc) {
788 $return['photo'][] = $this->resolveUrl($exc->getMessage());
// Implied url: @href of the element itself when it is an <a>, or of a child
// <a> that has no <a> siblings.
793 if (!array_key_exists('url', $return)) {
795 if ($e->tagName == 'a')
796 $url = $e->getAttribute('href');
// NOTE(review): the next comment is stale — the query below selects <a>
// elements and reads @href, not img @src.
798 // Look for nested img @src
799 foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
800 $url = $em->getAttribute('href');
805 $return['url'][] = $this->resolveUrl($url);
808 // Make sure things are in alphabetical order
811 // Phew. Return the final result.
814 'properties' => $return
// array_filter() drops falsy entries from $children; array_values() then
// renumbers the remaining ones from 0.
816 if (!empty($children))
817 $parsed['children'] = array_values(array_filter($children));
/**
 * Parse Rels and Alternatives
 *
 * Returns [$rels, $alternates]. If the $rels value is to be empty, i.e. there are no links on the page
 * with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set
 * to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case),
 * it will be an empty array.
 *
 * Every element carrying both @rel and @href is considered. A link whose rel
 * contains `alternate` is added to $alternates (with its remaining rel values
 * and, when present, @media and @hreflang); any other link has its resolved
 * URL added to $rels under each of its rel values.
 *
 * @return array array($rels, $alternates)
 */
public function parseRelsAndAlternates() {
	$rels = array();
	$alternates = array();

	// Iterate through all a, area and link elements with rel attributes
	foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
		// rel is a set of tokens separated by ASCII whitespace, not only by
		// the space character, so split on all of it.
		$linkRels = preg_split('/[ \t\n\f\r]+/', $hyperlink->getAttribute('rel'), -1, PREG_SPLIT_NO_EMPTY);

		if (empty($linkRels)) {
			continue;
		}

		// Resolve the href
		$href = $this->resolveUrl($hyperlink->getAttribute('href'));

		if (!in_array('alternate', $linkRels, true)) {
			foreach ($linkRels as $rel) {
				$rels[$rel][] = $href;
			}
			continue;
		}

		// alternate in rels: create alternate structure, append
		$alt = array(
			'url' => $href,
			'rel' => implode(' ', array_diff($linkRels, array('alternate')))
		);

		if ($hyperlink->hasAttribute('media')) {
			$alt['media'] = $hyperlink->getAttribute('media');
		}

		if ($hyperlink->hasAttribute('hreflang')) {
			$alt['hreflang'] = $hyperlink->getAttribute('hreflang');
		}

		$alternates[] = $alt;
	}

	// An empty PHP array would serialise to [] rather than {}.
	if (empty($rels) and $this->jsonMode) {
		$rels = new stdClass();
	}

	return array($rels, $alternates);
}
/**
 * Kicks off the parsing routine
 *
 * If `$convertClassic` is set, classic microformats classnames are first
 * upgraded in place via convertLegacy().
 *
 * If a DOMElement is set as the $context, only descendants of that element will
 * be parsed for microformats. Rel values are always collected document-wide.
 *
 * @param bool $convertClassic whether or not to convert classic microformats. Defaults to true
 * @param DOMElement $context optionally an element from which to parse microformats
 * @return array An array containing all the µfs found in the current document
 */
public function parse($convertClassic = true, DOMElement $context = null) {
	if ($convertClassic) {
		$this->convertLegacy();
	}

	if (null === $context) {
		$mfElements = $this->xpath->query('//*[contains(concat(" ", @class), " h-")]');
	} else {
		$mfElements = $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
	}

	// Parse each microformat root; parseH() yields null for elements which were
	// already consumed as part of another microformat, and those are left out.
	$items = array();
	foreach ($mfElements as $node) {
		$result = $this->parseH($node);
		if ($result) {
			$items[] = $result;
		}
	}

	list($rels, $alternates) = $this->parseRelsAndAlternates();

	$top = array(
		'items' => $items,
		'rels' => $rels
	);

	if (count($alternates) > 0) {
		$top['alternates'] = $alternates;
	}

	return $top;
}
/**
 * Parse From ID
 *
 * Given an ID, parse all microformats which are children of the element with
 * that ID.
 *
 * Note that rel values are still document-wide.
 *
 * If an element with the ID is not found, an empty skeleton mf2 array structure
 * will be returned.
 *
 * @param string $id
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array
 */
public function parseFromId($id, $convertClassic=true) {
	// Compare the attribute value in PHP rather than interpolating $id into
	// the XPath expression, where a quote character in the ID would break
	// (or alter) the query.
	$context = null;
	foreach ($this->xpath->query('//*[@id]') as $candidate) {
		if ($candidate->getAttribute('id') === (string) $id) {
			$context = $candidate;
			break;
		}
	}

	if ($context === null) {
		return array('items' => array(), 'rels' => array(), 'alternates' => array());
	}

	return $this->parse($convertClassic, $context);
}
/**
 * Convert Legacy Classnames
 *
 * Adds microformats2 classnames into a document containing only legacy
 * semantic classnames. The document is modified in place: for every element
 * carrying a classic classname, the mapped mf2 classname(s) are appended to
 * its class attribute unless already present.
 *
 * Roots are upgraded using $classicRootMap; properties are upgraded using
 * $classicPropertyMap and only when they sit inside an element carrying the
 * corresponding classic root classname.
 *
 * @return Parser $this
 */
public function convertLegacy() {
	$xp = new DOMXPath($this->doc);

	// XPath test for "class attribute contains this whole token"
	$hasClass = 'contains(concat(" ", @class, " "), " %s ")';

	// replace all roots
	foreach ($this->classicRootMap as $old => $new) {
		$expression = '//*[' . sprintf($hasClass, $old) . ' and not(' . sprintf($hasClass, $new) . ')]';

		foreach ($xp->query($expression) as $el) {
			$el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
		}
	}

	// replace properties, scoped to descendants of their classic root
	foreach ($this->classicPropertyMap as $oldRoot => $properties) {
		foreach ($properties as $old => $new) {
			$expression = '//*[' . sprintf($hasClass, $oldRoot) . ']'
				. '//*[' . sprintf($hasClass, $old) . ' and not(' . sprintf($hasClass, $new) . ')]';

			foreach ($xp->query($expression) as $el) {
				$el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
			}
		}
	}

	return $this;
}
/**
 * XPath Query
 *
 * Runs an XPath query over the current document. Works in exactly the same
 * way as DOMXPath::query.
 *
 * @param string $expression
 * @param DOMNode $context
 * @return DOMNodeList
 */
public function query($expression, $context = null) {
	$nodes = $this->xpath->query($expression, $context);

	return $nodes;
}
988 * Classic Root Classname map
// Keys are classic root classnames, values the mf2 h-* classname that
// convertLegacy() appends to elements carrying the classic one.
// convertLegacy() also looks roots up here by the keys of $classicPropertyMap.
990 public $classicRootMap = array(
993 'hentry' => 'h-entry',
994 'hrecipe' => 'h-recipe',
995 'hresume' => 'h-resume',
996 'hevent' => 'h-event',
997 'hreview' => 'h-review'
// Classic property classname map, consumed by convertLegacy(): the outer keys
// are classic root classnames, each inner array maps a classic property
// classname to the mf2 class string appended to matching descendants of that
// root. A value may hold several space-separated classnames
// (e.g. 'p-author h-card'), which are appended together.
1000 public $classicPropertyMap = array(
1004 'honorific-prefix' => 'p-honorific-prefix',
1005 'given-name' => 'p-given-name',
1006 'additional-name' => 'p-additional-name',
1007 'family-name' => 'p-family-name',
1008 'honorific-suffix' => 'p-honorific-suffix',
1009 'nickname' => 'p-nickname',
1010 'email' => 'u-email',
1012 'photo' => 'u-photo',
1015 'category' => 'p-category',
1016 'adr' => 'p-adr h-adr',
1017 'extended-address' => 'p-extended-address',
1018 'street-address' => 'p-street-address',
1019 'locality' => 'p-locality',
1020 'region' => 'p-region',
1021 'postal-code' => 'p-postal-code',
1022 'country-name' => 'p-country-name',
1023 'label' => 'p-label',
1024 'geo' => 'p-geo h-geo',
1025 'latitude' => 'p-latitude',
1026 'longitude' => 'p-longitude',
1029 'bday' => 'dt-bday',
1032 'organization-name' => 'p-organization-name',
1033 'organization-unit' => 'p-organization-unit',
1036 'entry-title' => 'p-name',
1037 'entry-summary' => 'p-summary',
1038 'entry-content' => 'e-content',
1039 'published' => 'dt-published',
1040 'updated' => 'dt-updated',
1041 'author' => 'p-author h-card',
1042 'category' => 'p-category',
1043 'geo' => 'p-geo h-geo',
1044 'latitude' => 'p-latitude',
1045 'longitude' => 'p-longitude',
1049 'ingredient' => 'p-ingredient',
1050 'yield' => 'p-yield',
1051 'instructions' => 'e-instructions',
1052 'duration' => 'dt-duration',
1053 'nutrition' => 'p-nutrition',
1054 'photo' => 'u-photo',
1055 'summary' => 'p-summary',
1056 'author' => 'p-author h-card'
1059 'summary' => 'p-summary',
1060 'contact' => 'h-card p-contact',
1061 'education' => 'h-event p-education',
1062 'experience' => 'h-event p-experience',
1063 'skill' => 'p-skill',
1064 'affiliation' => 'p-affiliation h-card',
1067 'dtstart' => 'dt-start',
1068 'dtend' => 'dt-end',
1069 'duration' => 'dt-duration',
// NOTE(review): 'description' appears twice in what looks like the same
// sub-array (here and two entries below). In a PHP array literal a repeated
// key keeps the later value; both values are identical, so it is harmless but
// redundant — confirm and drop one.
1070 'description' => 'p-description',
1071 'summary' => 'p-summary',
1072 'description' => 'p-description',
1074 'category' => 'p-category',
1075 'location' => 'h-card',
1076 'geo' => 'p-location h-geo'
1079 'summary' => 'p-name',
1080 'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec
1081 'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
1082 'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
1083 'reviewer' => 'p-reviewer p-author h-card',
1084 'dtreviewed' => 'dt-reviewed',
1085 'rating' => 'p-rating',
1087 'worst' => 'p-worst',
1088 'description' => 'p-description'
/**
 * Split a URI into the five components used by RFC 3986 reference resolution.
 *
 * The authority is reassembled from the user, password, host and port reported
 * by parse_url(), and is only set when a host is present. Components which are
 * absent from the URI are null. A URI which parse_url() cannot parse at all
 * yields an array in which every component is null.
 *
 * @param string $uri
 * @return array with the keys scheme, authority, path, query and fragment
 */
function parseUriToComponents($uri) {
	$result = array(
		'scheme' => null,
		'authority' => null,
		'path' => null,
		'query' => null,
		'fragment' => null
	);

	$u = @parse_url($uri);

	// parse_url() returns false for seriously malformed URLs; there is
	// nothing to extract then.
	if (!is_array($u)) {
		return $result;
	}

	if (array_key_exists('host', $u)) {
		$authority = '';

		if (array_key_exists('user', $u)) {
			$authority .= $u['user'];
		}
		if (array_key_exists('pass', $u)) {
			$authority .= ':' . $u['pass'];
		}
		if (array_key_exists('user', $u) || array_key_exists('pass', $u)) {
			$authority .= '@';
		}

		$authority .= $u['host'];

		if (array_key_exists('port', $u)) {
			$authority .= ':' . $u['port'];
		}

		$result['authority'] = $authority;
	}

	// The remaining components map across one-to-one.
	foreach (array('scheme', 'path', 'query', 'fragment') as $component) {
		if (array_key_exists($component, $u)) {
			$result[$component] = $u[$component];
		}
	}

	return $result;
}
/**
 * Resolve a URI reference against a base URI, following RFC 3986 section 5.2.
 *
 * @param string $baseURI The absolute URI to resolve against
 * @param string $referenceURI The (possibly relative) reference
 * @return string The recomposed target URI
 */
function resolveUrl($baseURI, $referenceURI) {
	# 5.2.1 Pre-parse the Base URI
	$base = parseUriToComponents($baseURI);

	# If base path is blank (http://example.com) then set it to /
	if ($base['path'] == null) {
		$base['path'] = '/';
	}

	# 5.2.2. Transform References
	$reference = parseUriToComponents($referenceURI);

	if ($reference['scheme']) {
		# The reference is absolute: only its dot segments are cleaned up
		$scheme = $reference['scheme'];
		$authority = $reference['authority'];
		$path = removeDotSegments($reference['path']);
		$query = $reference['query'];
	} elseif ($reference['authority']) {
		# Network-path reference: inherit the scheme only
		$scheme = $base['scheme'];
		$authority = $reference['authority'];
		$path = removeDotSegments($reference['path']);
		$query = $reference['query'];
	} elseif ($reference['path'] == '') {
		# Same document: keep the base path, and the base query unless the
		# reference brings its own
		$scheme = $base['scheme'];
		$authority = $base['authority'];
		$path = $base['path'];
		$query = $reference['query'] ? $reference['query'] : $base['query'];
	} else {
		$scheme = $base['scheme'];
		$authority = $base['authority'];

		if (substr($reference['path'], 0, 1) == '/') {
			# Absolute path reference
			$path = removeDotSegments($reference['path']);
		} else {
			# Relative path reference: merge with the base path first
			$path = removeDotSegments(mergePaths($base, $reference));
		}

		$query = $reference['query'];
	}

	$fragment = $reference['fragment'];

	# 5.3 Component Recomposition
	$result = '';

	if ($scheme) {
		$result .= $scheme . ':';
	}

	if ($authority) {
		$result .= '//' . $authority;
	}

	$result .= $path;

	if ($query) {
		$result .= '?' . $query;
	}

	if ($fragment) {
		$result .= '#' . $fragment;
	} elseif ($referenceURI == '#') {
		# A bare "#" reference keeps its (empty) fragment marker
		$result .= '#';
	}

	return $result;
}
/**
 * Merge a relative-path reference with the base path (RFC 3986 section 5.2.3).
 *
 * @param array $base Base URI components; the keys authority and path are read
 * @param array $reference Reference URI components; the key path is read
 * @return string The merged path (dot segments are not yet removed)
 */
function mergePaths($base, $reference) {
	$basePath = (string) $base['path'];

	# If the base URI has a defined authority component and an empty
	# path, then return a string consisting of "/" concatenated with the
	# reference's path; otherwise,
	if ($base['authority'] && $basePath === '') {
		return '/' . $reference['path'];
	}

	$lastSlash = strrpos($basePath, '/');

	if ($lastSlash === false) {
		# the entire base path is excluded when it does not contain any "/"
		# characters, leaving just the reference's path.
		return $reference['path'];
	}

	# return a string consisting of the reference's path component
	# appended to all but the last segment of the base URI's path (i.e.,
	# excluding any characters after the right-most "/" in the base URI
	# path)
	return substr($basePath, 0, $lastSlash + 1) . $reference['path'];
}
# 5.2.4.A Remove leading ../ or ./
# $input is modified in place; anything without such a prefix is left alone.
function removeLeadingDotSlash(&$input) {
	foreach (array('../', './') as $prefix) {
		$length = strlen($prefix);

		if (substr($input, 0, $length) == $prefix) {
			$input = substr($input, $length);
			return;
		}
	}
}
# 5.2.4.B Replace leading /. with /
# $input is modified in place: "/./" drops three characters, anything else
# (the caller passes a bare "/.") drops two, and "/" is put back in front.
function removeLeadingSlashDot(&$input) {
	$prefixLength = (substr($input, 0, 3) == '/./') ? 3 : 2;

	$input = '/' . substr($input, $prefixLength);
}
# 5.2.4.C Given leading /../ remove component from output buffer
# Both buffers are modified in place: the "/../" (or bare "/..") prefix of
# $input becomes "/", and $output loses its last segment together with the
# "/" in front of it.
function removeOneDirLevel(&$input, &$output) {
	$prefixLength = (substr($input, 0, 4) == '/../') ? 4 : 3;
	$input = '/' . substr($input, $prefixLength);

	$lastSlash = strrpos($output, '/');
	$output = substr($output, 0, $lastSlash);
}
# 5.2.4.D Remove . and .. if it's the only thing in the input
# $input is modified in place: one character is dropped for ".", two otherwise.
function removeLoneDotDot(&$input) {
	$input = substr($input, ($input == '.') ? 1 : 2);
}
# 5.2.4.E Move one segment from input to output
# Both buffers are modified in place. The segment is the leading "/" (if any)
# plus everything up to, but not including, the next "/"; when there is no
# further "/", all of $input is moved.
function moveOneSegmentFromInput(&$input, &$output) {
	# Skip over a leading "/" when looking for the end of the segment
	$searchFrom = (substr($input, 0, 1) != '/') ? 0 : 1;
	$segmentEnd = strpos($input, '/', $searchFrom);

	if ($segmentEnd === false) {
		$output .= $input;
		$input = '';
		return;
	}

	$output .= substr($input, 0, $segmentEnd);
	$input = substr($input, $segmentEnd);
}
1292 # 5.2.4 Remove Dot Segments
1293 function removeDotSegments($path) {
1294 # 1. The input buffer is initialized with the now-appended path
1295 # components and the output buffer is initialized to the empty
1302 # 2. While the input buffer is not empty, loop as follows:
1306 if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
1307 # A. If the input buffer begins with a prefix of "../" or "./",
1308 # then remove that prefix from the input buffer; otherwise,
1309 removeLeadingDotSlash($input);
1310 } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
1311 # B. if the input buffer begins with a prefix of "/./" or "/.",
1312 # where "." is a complete path segment, then replace that
1313 # prefix with "/" in the input buffer; otherwise,
1314 removeLeadingSlashDot($input);
1315 } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
1316 # C. if the input buffer begins with a prefix of "/../" or "/..",
1317 # where ".." is a complete path segment, then replace that
1318 # prefix with "/" in the input buffer and remove the last
1319 # segment and its preceding "/" (if any) from the output
1320 # buffer; otherwise,
1321 removeOneDirLevel($input, $output);
1322 } elseif($input == '.' || $input == '..') {
1323 # D. if the input buffer consists only of "." or "..", then remove
1324 # that from the input buffer; otherwise,
1325 removeLoneDotDot($input);
1327 # E. move the first path segment in the input buffer to the end of
1328 # the output buffer and any subsequent characters up to, but not including,
1329 # the next "/" character or the end of the input buffer
1330 moveOneSegmentFromInput($input, $output);