/**
 * Convenience wrapper for the most common case: parse microformats2 out of
 * HTML with a single function call.
 *
 * A Parser is constructed for the given input and URL and run immediately.
 *
 * Example:
 *
 *     $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * The printed JSON includes `"name": ["Barnaby Walters"]`.
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, for relative URL resolution
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
    $mf2Parser = new Parser($input, $url);
    $canonical = $mf2Parser->parse($convertClassic);

    return $canonical;
}
/**
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type
 * appears to be HTML, returns the parsed microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type
 * is HTML-like then it will be parsed all the same, as there are legitimate cases
 * where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason
 * for deletion). Look in $curlInfo['http_code'] for the actual value.
 *
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param array &$curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    $html = curl_exec($ch);
    // Always publish the transfer info, even when the request failed, so the
    // caller can inspect what happened.
    $info = $curlInfo = curl_getinfo($ch);
    curl_close($ch);

    // curl_exec() yields false when the transfer itself failed; there is no
    // body to parse in that case.
    if ($html === false) {
        return null;
    }

    // content_type may be absent/null when the server sent no Content-Type
    // header; treat that the same as "not HTML" without feeding null to
    // strtolower().
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';

    if (strpos(strtolower($contentType), 'html') === false) {
        // The content was not delivered as HTML, do not attempt to parse it.
        return null;
    }

    return parse($html, $url, $convertClassic);
}
/**
 * Unicode to HTML Entities
 *
 * Re-encodes $input using mbstring's 'HTML-ENTITIES' target encoding. The
 * source encoding is whatever mb_detect_encoding() reports for the input.
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string The converted string
 */
function unicodeToHtmlEntities($input) {
    $sourceEncoding = mb_detect_encoding($input);

    // mb_detect_encoding() returns false when no candidate encoding fits.
    // Handing false to mb_convert_encoding() as the source encoding makes the
    // conversion fail, so fall back to UTF-8 instead.
    if ($sourceEncoding === false) {
        $sourceEncoding = 'UTF-8';
    }

    return mb_convert_encoding($input, 'HTML-ENTITIES', $sourceEncoding);
}
/**
 * Collapses any sequences of whitespace within a string into a single space
 *
 * @deprecated since v0.2.3
 * @param string $str The string to normalise
 * @return string $str with every run of whitespace replaced by one space
 */
function collapseWhitespace($str) {
    // "\s" already includes "\n". The previous character class "[\s|\n]" also
    // contained a literal "|", so pipe characters were turned into spaces.
    return preg_replace('/\s+/', ' ', $str);
}
/**
 * Trims whitespace from both ends of a string, treating the non-breaking
 * space (U+00A0) as whitespace.
 *
 * Every UTF-8 encoded U+00A0 in the string, including interior ones, is first
 * replaced by a plain space; leading and trailing whitespace is then removed.
 *
 * @param string $str The string to trim
 * @return string The trimmed string
 */
function unicodeTrim($str) {
    // this is cheating. TODO: find a better way if this causes any problems
    // "\xC2\xA0" is the UTF-8 byte sequence of U+00A0. It is spelled out
    // directly rather than derived at runtime through mbstring's
    // 'HTML-ENTITIES' conversion.
    $str = str_replace("\xC2\xA0", ' ', $str);

    return preg_replace('/^\s+|\s+$/', '', $str);
}
/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name) that start with $prefix.
 *
 * For the 'h-' prefix the full classname is returned (e.g. 'h-card'); for any
 * other prefix the prefix is stripped (e.g. 'p-name' yields 'name'). A
 * classname consisting of the prefix alone is ignored.
 *
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array The matching names in document order; empty if none match
 */
function mfNamesFromClass($class, $prefix='h-') {
    // Split on every HTML whitespace character (space, tab, LF, FF, CR) so
    // that tab- or CR-separated classnames are not glued together.
    $classes = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);
    $prefixLength = strlen($prefix);
    $matches = array();

    foreach ($classes as $classname) {
        // Must start with the prefix and carry something after it.
        if (strncmp($classname, $prefix, $prefixLength) !== 0 || $classname === $prefix) {
            continue;
        }

        $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
    }

    return $matches;
}
/**
 * Get Nested µf Property Name From Class
 *
 * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
 * whitespace-separated string.
 *
 * @param string $class The value of a class attribute
 * @return array Map of property name (prefix stripped) to the de-duplicated
 *               list of prefixes it was found with, e.g.
 *               array('author' => array('p-', 'e-'))
 */
function nestedMfPropertyNamesFromClass($class) {
    $prefixes = array('p-', 'u-', 'dt-', 'e-');
    $propertyNames = array();

    // Split on every HTML whitespace character (space, tab, LF, FF, CR).
    $classnames = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($classnames as $classname) {
        foreach ($prefixes as $prefix) {
            $prefixLength = strlen($prefix);

            // Check if $classname is a valid property classname for $prefix:
            // it starts with the prefix and has a non-empty name after it.
            if (strncmp($classname, $prefix, $prefixLength) === 0 && $classname !== $prefix) {
                $propertyNames[substr($classname, $prefixLength)][] = $prefix;
            }
        }
    }

    // A separate loop variable keeps the prefix table above intact.
    foreach ($propertyNames as $property => $foundPrefixes) {
        $propertyNames[$property] = array_unique($foundPrefixes);
    }

    return $propertyNames;
}
/**
 * Element-based front end for mfNamesFromClass(): reads the element's class
 * attribute and passes it on together with the prefix.
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @param string $prefix The prefix to look for
 * @return mixed See return value of mfNamesFromClass()
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
    return mfNamesFromClass($e->getAttribute('class'), $prefix);
}
/**
 * Element-based front end for nestedMfPropertyNamesFromClass(): reads the
 * element's class attribute and passes it on.
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return mixed See return value of nestedMfPropertyNamesFromClass()
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
    return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}
/**
 * Converts a 12-hour clock time carrying an am/pm marker to 24-hour HH:MM
 * (or HH:MM:SS when seconds are present).
 *
 * The marker may be written as am, pm, a.m. or p.m. in any letter case. A
 * time without such a marker is returned unchanged.
 *
 * @param string $time The time to convert
 * @return string The converted time, or $time itself if no am/pm was found
 */
function convertTimeFormat($time) {
    $matches = array();
    preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

    // If no am/pm is specified there is nothing to convert.
    if (empty($matches[4])) {
        return $time;
    }

    // Otherwise, am/pm is specified. Normalise "P.M." and friends to "pm".
    $meridiem = strtolower(str_replace('.', '', $matches[4]));

    // Hours.
    $hh = (int) $matches[1];

    if ($meridiem == 'pm' && $hh < 12) {
        // Add 12 to hours if pm applies.
        $hh += 12;
    } elseif ($meridiem == 'am' && $hh == 12) {
        // 12am is midnight, i.e. hour 00 on the 24-hour clock.
        $hh = 0;
    }

    $hh = str_pad((string) $hh, 2, '0', STR_PAD_LEFT);

    // Minutes default to 00 when only an hour was given (e.g. "9pm").
    $mm = empty($matches[2]) ? '00' : $matches[2];

    // Seconds, only if supplied.
    if (empty($matches[3])) {
        return sprintf('%s:%s', $hh, $mm);
    }

    return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}
235 * Microformats2 Parser
237 * A class which holds state for parsing microformats2 from HTML.
242 * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
243 * $output = $parser->parse();
246 /** @var string The baseurl (if any) to use for this parse */
249 /** @var DOMXPath object which can be used to query over any fragment*/
252 /** @var DOMDocument */
255 /** @var SplObjectStorage */
263 * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
264 * @param string $url The URL of the parsed document, for relative URL resolution
265 * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
267 public function __construct($input, $url = null, $jsonMode = false) {
268 libxml_use_internal_errors(true);
269 if (is_string($input)) {
270 $doc = new DOMDocument();
271 @$doc->loadHTML(unicodeToHtmlEntities($input));
272 } elseif (is_a($input, 'DOMDocument')) {
275 $doc = new DOMDocument();
279 $this->xpath = new DOMXPath($doc);
282 foreach ($this->xpath->query('//base[@href]') as $base) {
283 $baseElementUrl = $base->getAttribute('href');
285 if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
286 /* The base element URL is relative to the document URL.
290 * Perhaps the author was high? */
292 $baseurl = resolveUrl($url, $baseElementUrl);
294 $baseurl = $baseElementUrl;
299 // Ignore <template> elements as per the HTML5 spec
300 foreach ($this->xpath->query('//template') as $templateEl) {
301 $templateEl->parentNode->removeChild($templateEl);
304 $this->baseurl = $baseurl;
306 $this->parsed = new SplObjectStorage();
307 $this->jsonMode = $jsonMode;
/**
 * Records that $e has been handled for the given prefix.
 *
 * The list of prefixes is kept as the data attached to $e in $this->parsed.
 *
 * @param \DOMElement $e The element that was handled
 * @param string $prefix The prefix to record for it
 */
private function elementPrefixParsed(\DOMElement $e, $prefix) {
    // Start from the list already stored for this element, if there is one.
    $recorded = $this->parsed->contains($e) ? $this->parsed[$e] : array();
    $recorded[] = $prefix;

    // Assigning by offset attaches the element when it is not yet stored.
    $this->parsed[$e] = $recorded;
}
/**
 * Tells whether $prefix has been recorded for $e in $this->parsed.
 *
 * @param \DOMElement $e The element to look up
 * @param string $prefix The prefix to check for
 * @return bool true only if the element is stored and its list contains $prefix
 */
private function isElementParsed(\DOMElement $e, $prefix) {
    return $this->parsed->contains($e) && in_array($prefix, $this->parsed[$e]);
}
/**
 * Rewrites the href, src and data attributes of every descendant of $el by
 * passing each value through $this->resolveUrl(). The DOM is modified in place.
 *
 * @param DOMElement $el The element whose descendants are processed
 */
private function resolveChildUrls(DOMElement $el) {
    $urlAttributes = array('href', 'src', 'data');

    foreach ($this->xpath->query('.//*[@src or @href or @data]', $el) as $descendant) {
        foreach ($urlAttributes as $attribute) {
            if ($descendant->hasAttribute($attribute)) {
                $resolved = $this->resolveUrl($descendant->getAttribute($attribute));
                $descendant->setAttribute($attribute, $resolved);
            }
        }
    }
}
/**
 * Returns the text content of $el, with each descendant img element standing
 * in as text: its alt attribute when present, otherwise its src attribute.
 *
 * URL attributes below $el are resolved first (in place); the img replacement
 * itself is carried out on a deep clone of $el.
 *
 * @param DOMElement $el The element to read
 * @return string The text content of the modified clone
 */
public function textContent(DOMElement $el) {
    $this->resolveChildUrls($el);

    $copy = $el->cloneNode(true);

    foreach ($this->xpath->query('.//img', $copy) as $image) {
        if ($image->hasAttribute('alt')) {
            $replacementText = $image->getAttribute('alt');
        } else {
            $replacementText = $image->getAttribute('src');
        }

        $textNode = $this->doc->createTextNode($replacementText);
        $image->parentNode->replaceChild($textNode, $image);
    }

    return $copy->textContent;
}
// TODO: figure out if this has problems with sms: and geo: URLs
/**
 * Resolves $url against the parser's base URL.
 *
 * The URL is returned untouched when parse_url() rejects it, when it already
 * carries a scheme, or when no base URL is set. Otherwise the resolveUrl()
 * function is applied to the base URL and $url.
 *
 * @param string $url The URL to resolve
 * @return string The resolved (or original) URL
 */
public function resolveUrl($url) {
    // If the URL is seriously malformed it’s probably beyond the scope of this
    // parser to try to do anything with it.
    $isMalformed = parse_url($url) === false;
    if ($isMalformed) {
        return $url;
    }

    $scheme = parse_url($url, PHP_URL_SCHEME);

    if (!empty($scheme) or empty($this->baseurl)) {
        return $url;
    }

    return resolveUrl($this->baseurl, $url);
}
/**
 * Parse value-class/value-title on an element.
 *
 * Direct children with class "value" take precedence: their text contents are
 * concatenated and trimmed. Failing that, the title attributes of direct
 * children with class "value-title" are concatenated and trimmed.
 *
 * NOTE(review): $separator is accepted but the parts are joined without any
 * separator here; confirm whether callers rely on that.
 *
 * @param \DOMElement $e
 * @param string $separator = '' if multiple value-title elements, join with this string
 * @return string|null the parsed value or null if value-class or -title aren’t in use
 */
public function parseValueClassTitle(\DOMElement $e, $separator = '') {
    $valueNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);

    if ($valueNodes->length !== 0) {
        // Process value-class stuff
        $pieces = array();
        foreach ($valueNodes as $valueNode) {
            $pieces[] = $this->textContent($valueNode);
        }

        return unicodeTrim(implode('', $pieces));
    }

    $titleNodes = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);

    if ($titleNodes->length !== 0) {
        // Process value-title stuff
        $pieces = array();
        foreach ($titleNodes as $titleNode) {
            $pieces[] = $titleNode->getAttribute('title');
        }

        return unicodeTrim(implode('', $pieces));
    }

    // No value-title or -class in this element
    return null;
}
/**
 * Given an element with class="p-*", get its value.
 *
 * Order of precedence: a value-class/value-title result; then a non-empty
 * alt (img, area), title (abbr) or value (data, input) attribute; finally the
 * trimmed text content of the element.
 *
 * @param DOMElement $p The element to parse
 * @return string The plaintext value of $p, dependent on type
 * @todo Make this adhere to value-class
 */
public function parseP(\DOMElement $p) {
    $fromValueClass = $this->parseValueClassTitle($p, ' ');

    if ($fromValueClass !== null) {
        return $fromValueClass;
    }

    // Which attribute carries the value for which tag.
    $attributeForTag = array(
        'img' => 'alt',
        'area' => 'alt',
        'abbr' => 'title',
        'data' => 'value',
        'input' => 'value',
    );

    $tag = $p->tagName;

    if (isset($attributeForTag[$tag])) {
        $attributeValue = $p->getAttribute($attributeForTag[$tag]);

        // An empty (or missing) attribute falls through to the text content.
        if ($attributeValue !== '') {
            return $attributeValue;
        }
    }

    return unicodeTrim($this->textContent($p));
}
/**
 * Given an element with class="u-*", get the value of the URL.
 *
 * Order of precedence:
 *  1. the URL-bearing attribute of the tag, if the attribute is present
 *     (href on a/area, src on img/audio/video/source, data on object),
 *     resolved through $this->resolveUrl();
 *  2. a value-class/value-title result;
 *  3. the title attribute of an abbr, or the value attribute of data/input,
 *     if present;
 *  4. the trimmed text content of the element.
 *
 * @param DOMElement $u The element to parse
 * @return string The plaintext value of $u, dependent on type
 * @todo make this adhere to value-class
 */
public function parseU(\DOMElement $u) {
    $urlAttributeForTag = array(
        'a' => 'href',
        'area' => 'href',
        'img' => 'src',
        'audio' => 'src',
        'video' => 'src',
        'source' => 'src',
        'object' => 'data',
    );

    $tag = $u->tagName;

    // DOMElement::getAttribute() returns '' (never null) for a missing
    // attribute, so presence has to be tested with hasAttribute(); otherwise
    // the fallbacks below can never be reached for these tags.
    if (isset($urlAttributeForTag[$tag]) and $u->hasAttribute($urlAttributeForTag[$tag])) {
        return $this->resolveUrl($u->getAttribute($urlAttributeForTag[$tag]));
    }

    $classTitle = $this->parseValueClassTitle($u);

    if ($classTitle !== null) {
        return $classTitle;
    }

    if ($tag == 'abbr' and $u->hasAttribute('title')) {
        return $u->getAttribute('title');
    }

    if (in_array($tag, array('data', 'input')) and $u->hasAttribute('value')) {
        return $u->getAttribute('value');
    }

    return unicodeTrim($this->textContent($u));
}
474 * Given an element with class="dt-*", get the value of the datetime as a string
476 * @param DOMElement $dt The element to parse
477 * @param array $dates Array of dates processed so far
478 * @return string The datetime string found
480 public function parseDT(\DOMElement $dt, &$dates = array()) {
481 // Check for value-class pattern
482 $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
485 if ($valueClassChildren->length > 0) {
486 // They’re using value-class
487 $dateParts = array();
489 foreach ($valueClassChildren as $e) {
490 if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
491 $title = $e->getAttribute('title');
493 $dateParts[] = $title;
495 elseif ($e->tagName == 'img' or $e->tagName == 'area') {
497 $alt = $e->getAttribute('alt');
501 elseif ($e->tagName == 'data') {
502 // Use @value, otherwise innertext
503 $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue);
505 $dateParts[] = $value;
507 elseif ($e->tagName == 'abbr') {
508 // Use @title, otherwise innertext
509 $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue);
511 $dateParts[] = $title;
513 elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') {
514 // Use @datetime if available, otherwise innertext
515 $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue);
517 $dateParts[] = $dtAttr;
520 if (!empty($e->nodeValue))
521 $dateParts[] = unicodeTrim($e->nodeValue);
525 // Look through dateParts
528 foreach ($dateParts as $part) {
529 // Is this part a full ISO8601 datetime?
530 if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) {
531 // Break completely, we’ve got our value.
535 // Is the current part a valid time(+TZ?) AND no other time representation has been found?
536 if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) {
538 } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) {
539 // Is the current part a valid date AND no other date representation has been found?
543 if ( !empty($datePart) && !in_array($datePart, $dates) ) {
544 $dates[] = $datePart;
549 if ( empty($datePart) && !empty($timePart) ) {
550 $timePart = convertTimeFormat($timePart);
551 $dtValue = unicodeTrim($timePart, 'T');
553 else if ( !empty($datePart) && empty($timePart) ) {
554 $dtValue = rtrim($datePart, 'T');
557 $timePart = convertTimeFormat($timePart);
558 $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart, 'T');
563 // Not using value-class (phew).
564 if ($dt->tagName == 'img' or $dt->tagName == 'area') {
566 // Is it an entire dt?
567 $alt = $dt->getAttribute('alt');
570 } elseif (in_array($dt->tagName, array('data'))) {
571 // Use @value, otherwise innertext
572 // Is it an entire dt?
573 $value = $dt->getAttribute('value');
577 $dtValue = $dt->nodeValue;
578 } elseif ($dt->tagName == 'abbr') {
579 // Use @title, otherwise innertext
580 // Is it an entire dt?
581 $title = $dt->getAttribute('title');
585 $dtValue = $dt->nodeValue;
586 } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') {
587 // Use @datetime if available, otherwise innertext
588 // Is it an entire dt?
589 $dtAttr = $dt->getAttribute('datetime');
593 $dtValue = $dt->nodeValue;
595 $dtValue = $dt->nodeValue;
598 if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
599 $dates[] = $matches[0];
604 * if $dtValue is only a time and there are recently parsed dates,
605 * form the full date-time using the most recently parsed dt- value
607 if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates)) {
608 $dtValue = convertTimeFormat($dtValue);
609 $dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T');
/**
 * Given the root element of some embedded markup, return that markup together
 * with its plaintext value.
 *
 * When value-class/value-title is in use on $e, that parsed string is returned
 * instead of the array.
 *
 * @param DOMElement $e The element to parse
 * @return array|string array with 'html' ($e’s innerHTML, built from the
 *                      C14N serialisation of each child node) and 'value'
 *                      (trimmed text content), or the value-class string
 *
 * @todo need to mark this element as e- parsed so it doesn’t get parsed as its parent’s e-* too
 */
public function parseE(\DOMElement $e) {
    $fromValueClass = $this->parseValueClassTitle($e);

    if ($fromValueClass !== null) {
        return $fromValueClass;
    }

    // Expand relative URLs within children of this element
    // TODO: as it is this is not relative to only children, make this .// and rerun tests
    $this->resolveChildUrls($e);

    $innerHtml = '';
    foreach ($e->childNodes as $childNode) {
        $innerHtml .= $childNode->C14N();
    }

    $plainText = unicodeTrim($this->textContent($e));

    return array(
        'html' => $innerHtml,
        'value' => $plainText
    );
}
645 * Recursively parse microformats
647 * @param DOMElement $e The element to parse
648 * @return array A representation of the values contained within microformat $e
650 public function parseH(\DOMElement $e) {
651 // If it’s already been parsed (e.g. is a child mf), skip
652 if ($this->parsed->contains($e))
655 // Get current µf name
656 $mfTypes = mfNamesFromElement($e, 'h-');
658 // Initialise var to store the representation in
663 // Handle nested microformats (h-*)
664 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
666 $result = $this->parseH($subMF);
668 // If result was already parsed, skip it
669 if (null === $result)
672 // In most cases, the value attribute of the nested microformat should be the p- parsed value of the element.
673 // The only times this is different is when the microformat is nested under certain prefixes, which are handled below.
674 $result['value'] = $this->parseP($subMF);
676 // Does this µf have any property names other than h-*?
677 $properties = nestedMfPropertyNamesFromElement($subMF);
679 if (!empty($properties)) {
680 // Yes! It’s a nested property µf
681 foreach ($properties as $property => $prefixes) {
682 // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
683 $prefixSpecificResult = $result;
684 if (in_array('p-', $prefixes)) {
685 $prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
686 } elseif (in_array('e-', $prefixes)) {
687 $eParsedResult = $this->parseE($subMF);
688 $prefixSpecificResult['html'] = $eParsedResult['html'];
689 $prefixSpecificResult['value'] = $eParsedResult['value'];
690 } elseif (in_array('u-', $prefixes)) {
691 $prefixSpecificResult['value'] = $this->parseU($subMF);
693 $return[$property][] = $prefixSpecificResult;
696 // No, it’s a child µf
697 $children[] = $result;
700 // Make sure this sub-mf won’t get parsed as a µf or property
701 // TODO: Determine if clearing this is required?
702 $this->elementPrefixParsed($subMF, 'h');
703 $this->elementPrefixParsed($subMF, 'p');
704 $this->elementPrefixParsed($subMF, 'u');
705 $this->elementPrefixParsed($subMF, 'dt');
706 $this->elementPrefixParsed($subMF, 'e');
709 if($e->tagName == 'area') {
710 $coords = $e->getAttribute('coords');
711 $shape = $e->getAttribute('shape');
715 foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
716 if ($this->isElementParsed($p, 'p'))
719 $pValue = $this->parseP($p);
721 // Add the value to the array for its p- properties
722 foreach (mfNamesFromElement($p, 'p-') as $propName) {
723 if (!empty($propName))
724 $return[$propName][] = $pValue;
727 // Make sure this sub-mf won’t get parsed as a top level mf
728 $this->elementPrefixParsed($p, 'p');
732 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
733 if ($this->isElementParsed($u, 'u'))
736 $uValue = $this->parseU($u);
738 // Add the value to the array for its property types
739 foreach (mfNamesFromElement($u, 'u-') as $propName) {
740 $return[$propName][] = $uValue;
743 // Make sure this sub-mf won’t get parsed as a top level mf
744 $this->elementPrefixParsed($u, 'u');
748 foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
749 if ($this->isElementParsed($dt, 'dt'))
752 $dtValue = $this->parseDT($dt, $dates);
755 // Add the value to the array for dt- properties
756 foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
757 $return[$propName][] = $dtValue;
761 // Make sure this sub-mf won’t get parsed as a top level mf
762 $this->elementPrefixParsed($dt, 'dt');
766 foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) {
767 if ($this->isElementParsed($em, 'e'))
770 $eValue = $this->parseE($em);
773 // Add the value to the array for e- properties
774 foreach (mfNamesFromElement($em, 'e-') as $propName) {
775 $return[$propName][] = $eValue;
778 // Make sure this sub-mf won’t get parsed as a top level mf
779 $this->elementPrefixParsed($em, 'e');
782 // Implied Properties
784 if (!array_key_exists('name', $return)) {
787 if (($e->tagName == 'img' or $e->tagName == 'area') and $e->getAttribute('alt') != '')
788 throw new Exception($e->getAttribute('alt'));
790 if ($e->tagName == 'abbr' and $e->hasAttribute('title'))
791 throw new Exception($e->getAttribute('title'));
793 // Look for nested img @alt
794 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
795 $emNames = mfNamesFromElement($em, 'h-');
796 if (empty($emNames) && $em->getAttribute('alt') != '') {
797 throw new Exception($em->getAttribute('alt'));
801 // Look for nested area @alt
802 foreach ($this->xpath->query('./area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
803 $emNames = mfNamesFromElement($em, 'h-');
804 if (empty($emNames) && $em->getAttribute('alt') != '') {
805 throw new Exception($em->getAttribute('alt'));
810 // Look for double nested img @alt
811 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
812 $emNames = mfNamesFromElement($em, 'h-');
813 if (empty($emNames) && $em->getAttribute('alt') != '') {
814 throw new Exception($em->getAttribute('alt'));
818 // Look for double nested area @alt
819 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
820 $emNames = mfNamesFromElement($em, 'h-');
821 if (empty($emNames) && $em->getAttribute('alt') != '') {
822 throw new Exception($em->getAttribute('alt'));
826 throw new Exception($e->nodeValue);
827 } catch (Exception $exc) {
828 $return['name'][] = unicodeTrim($exc->getMessage());
833 if (!array_key_exists('photo', $return)) {
836 if ($e->tagName == 'img')
837 throw new Exception($e->getAttribute('src'));
839 // Look for nested img @src
840 foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
841 if ($em->getAttribute('src') != '')
842 throw new Exception($em->getAttribute('src'));
845 // Look for double nested img @src
846 foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
847 if ($em->getAttribute('src') != '')
848 throw new Exception($em->getAttribute('src'));
850 } catch (Exception $exc) {
851 $return['photo'][] = $this->resolveUrl($exc->getMessage());
856 if (!array_key_exists('url', $return)) {
858 if ($e->tagName == 'a' or $e->tagName == 'area')
859 $url = $e->getAttribute('href');
861 // Look for nested a @href
862 foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
863 $emNames = mfNamesFromElement($em, 'h-');
864 if (empty($emNames)) {
865 $url = $em->getAttribute('href');
870 // Look for nested area @href
871 foreach ($this->xpath->query('./area[count(preceding-sibling::area)+count(following-sibling::area)=0]', $e) as $em) {
872 $emNames = mfNamesFromElement($em, 'h-');
873 if (empty($emNames)) {
874 $url = $em->getAttribute('href');
880 $return['url'][] = $this->resolveUrl($url);
883 // Make sure things are in alphabetical order
886 // Phew. Return the final result.
889 'properties' => $return
892 if (!empty($shape)) {
893 $parsed['shape'] = $shape;
896 if (!empty($coords)) {
897 $parsed['coords'] = $coords;
900 if (!empty($children)) {
901 $parsed['children'] = array_values(array_filter($children));
/**
 * Parse Rels and Alternatives
 *
 * Returns [$rels, $alternates]. If the $rels value is to be empty, i.e. there are
 * no links on the page with a rel value *not* containing `alternate`, then the
 * type of $rels depends on $this->jsonMode. If set to true, it will be a stdClass
 * instance, optimising for JSON serialisation. Otherwise (the default case), it
 * will be an empty array.
 *
 * @return array Two-element list: the rels map (rel value => list of URLs) and
 *               the list of alternate structures
 */
public function parseRelsAndAlternates() {
    $rels = array();
    $alternates = array();

    // Attributes copied verbatim onto an alternate when present, in this order.
    $optionalAttributes = array('media', 'hreflang', 'title', 'type');

    // Iterate through all elements carrying both rel and href attributes
    foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
        $relAttribute = $hyperlink->getAttribute('rel');

        if ($relAttribute == '') {
            continue;
        }

        // Resolve the href
        $href = $this->resolveUrl($hyperlink->getAttribute('href'));

        // Split up the rel into space-separated values
        $linkRels = array_filter(explode(' ', $relAttribute));

        if (!in_array('alternate', $linkRels)) {
            // Plain rel: file the URL under each of its rel values.
            foreach ($linkRels as $rel) {
                $rels[$rel][] = $href;
            }
            continue;
        }

        // alternate in rels: create alternate structure, append
        $alt = array(
            'url' => $href,
            'rel' => implode(' ', array_diff($linkRels, array('alternate')))
        );

        foreach ($optionalAttributes as $attribute) {
            if ($hyperlink->hasAttribute($attribute)) {
                $alt[$attribute] = $hyperlink->getAttribute($attribute);
            }
        }

        if ($hyperlink->nodeValue) {
            $alt['text'] = $hyperlink->nodeValue;
        }

        $alternates[] = $alt;
    }

    if (empty($rels) and $this->jsonMode) {
        $rels = new stdClass();
    }

    return array($rels, $alternates);
}
/**
 * Kicks off the parsing routine
 *
 * If `$convertClassic` is true, convertLegacy() is run first so that classic
 * microformats classnames gain their microformats2 equivalents.
 *
 * If a DOMElement is set as the $context, only descendants of that element will
 * be parsed for microformats. Rels and alternates are always collected through
 * parseRelsAndAlternates().
 *
 * @param bool $convertClassic whether or not to convert classic microformats. Defaults to true
 * @param DOMElement $context optionally an element from which to parse microformats
 * @return array An array with 'items' (all the µfs found), 'rels', and
 *               'alternates' when any were found
 */
public function parse($convertClassic = true, DOMElement $context = null) {
    if ($convertClassic) {
        $this->convertLegacy();
    }

    if (null === $context) {
        $mfElements = $this->xpath->query('//*[contains(concat(" ", @class), " h-")]');
    } else {
        $mfElements = $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
    }

    // Parse microformats, keeping only non-empty results (parseH() yields
    // nothing for elements it has already handled as nested microformats).
    $items = array();
    foreach ($mfElements as $node) {
        $result = $this->parseH($node);

        if ($result) {
            $items[] = $result;
        }
    }

    // Parse rels
    list($rels, $alternates) = $this->parseRelsAndAlternates();

    $top = array(
        'items' => $items,
        'rels' => $rels
    );

    if (count($alternates)) {
        $top['alternates'] = $alternates;
    }

    return $top;
}
/**
 * Given an ID, parse all microformats which are children of the element with
 * that ID.
 *
 * Note that rel values are still document-wide.
 *
 * If an element with the ID is not found, an empty skeleton mf2 array structure
 * will be returned.
 *
 * @param string $id The value of the id attribute to look for
 * @param bool $convertClassic whether or not to convert classic microformats. Defaults to true
 * @return array Canonical MF2 array structure
 */
public function parseFromId($id, $convertClassic=true) {
    $id = (string) $id;

    // Build a well-formed XPath string literal for any id. XPath 1.0 has no
    // escape syntax, so pick the quote style the id does not contain, or fall
    // back to concat() when it contains both kinds of quote.
    if (strpos($id, "'") === false) {
        $literal = "'" . $id . "'";
    } elseif (strpos($id, '"') === false) {
        $literal = '"' . $id . '"';
    } else {
        $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
    }

    $matches = $this->xpath->query('//*[@id=' . $literal . ']');

    // query() returns a DOMNodeList object, which empty() never considers
    // empty; the number of matches has to be checked through ->length.
    if ($matches === false or $matches->length === 0) {
        return array('items' => array(), 'rels' => array(), 'alternates' => array());
    }

    return $this->parse($convertClassic, $matches->item(0));
}
/**
 * Convert Legacy Classnames
 *
 * Adds microformats2 classnames into a document containing only legacy
 * semantic classnames. Elements are modified in place: the new classname(s)
 * are appended to the existing class attribute, and elements that already
 * carry the new value are left alone.
 *
 * @return Parser $this
 */
public function convertLegacy() {
    $legacyXpath = new DOMXPath($this->doc);

    // replace all roots
    foreach ($this->classicRootMap as $old => $new) {
        $rootQuery = '//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]';

        foreach ($legacyXpath->query($rootQuery) as $el) {
            $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
        }
    }

    // replace properties, but only below an element carrying the matching legacy root
    foreach ($this->classicPropertyMap as $oldRoot => $properties) {
        // NOTE(review): $newRoot is looked up but not referenced again in this method.
        $newRoot = $this->classicRootMap[$oldRoot];

        foreach ($properties as $old => $new) {
            $propertyQuery = '//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]';

            foreach ($legacyXpath->query($propertyQuery) as $el) {
                $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
            }
        }
    }

    return $this;
}
/**
 * Runs an XPath query over the current document. Works in exactly the same
 * way as DOMXPath::query; both arguments are handed to it unchanged.
 *
 * @param string $expression
 * @param DOMNode $context
 * @return DOMNodeList
 */
public function query($expression, $context = null) {
    $nodes = $this->xpath->query($expression, $context);

    return $nodes;
}
1082 * Classic Root Classname map
1084 public $classicRootMap = array(
1085 'vcard' => 'h-card',
1086 'hfeed' => 'h-feed',
1087 'hentry' => 'h-entry',
1088 'hrecipe' => 'h-recipe',
1089 'hresume' => 'h-resume',
1090 'vevent' => 'h-event',
1091 'hreview' => 'h-review',
1092 'hproduct' => 'h-product'
/**
 * Maps classic microformats property class names (left) to the microformats2
 * class string that stands in for them (right). The replacement is a prefixed
 * mf2 property (p-*, u-*, dt-* or e-*), and for some properties it also carries
 * an h-* root name, e.g. 'adr' => 'p-adr h-adr'.
 *
 * NOTE(review): names such as 'category', 'photo' and 'summary' recur below, so the
 * entries are presumably grouped into one sub-array per classic root name, like the
 * 'hproduct' => array( group — confirm the grouping keys before relying on them.
 */
1095 public $classicPropertyMap = array(
1099 'honorific-prefix' => 'p-honorific-prefix',
1100 'given-name' => 'p-given-name',
1101 'additional-name' => 'p-additional-name',
1102 'family-name' => 'p-family-name',
1103 'honorific-suffix' => 'p-honorific-suffix',
1104 'nickname' => 'p-nickname',
1105 'email' => 'u-email',
1107 'photo' => 'u-photo',
1110 'category' => 'p-category',
1111 'adr' => 'p-adr h-adr',
1112 'extended-address' => 'p-extended-address',
1113 'street-address' => 'p-street-address',
1114 'locality' => 'p-locality',
1115 'region' => 'p-region',
1116 'postal-code' => 'p-postal-code',
1117 'country-name' => 'p-country-name',
1118 'label' => 'p-label',
1119 'geo' => 'p-geo h-geo',
1120 'latitude' => 'p-latitude',
1121 'longitude' => 'p-longitude',
1124 'bday' => 'dt-bday',
1127 'organization-name' => 'p-organization-name',
1128 'organization-unit' => 'p-organization-unit',
1131 'entry-title' => 'p-name',
1132 'entry-summary' => 'p-summary',
1133 'entry-content' => 'e-content',
1134 'published' => 'dt-published',
1135 'updated' => 'dt-updated',
1136 'author' => 'p-author h-card',
1137 'category' => 'p-category',
1138 'geo' => 'p-geo h-geo',
1139 'latitude' => 'p-latitude',
1140 'longitude' => 'p-longitude',
1144 'ingredient' => 'p-ingredient',
1145 'yield' => 'p-yield',
1146 'instructions' => 'e-instructions',
1147 'duration' => 'dt-duration',
1148 'nutrition' => 'p-nutrition',
1149 'photo' => 'u-photo',
1150 'summary' => 'p-summary',
1151 'author' => 'p-author h-card'
1154 'summary' => 'p-summary',
1155 'contact' => 'h-card p-contact',
1156 'education' => 'h-event p-education',
1157 'experience' => 'h-event p-experience',
1158 'skill' => 'p-skill',
1159 'affiliation' => 'p-affiliation h-card',
1162 'dtstart' => 'dt-start',
1163 'dtend' => 'dt-end',
1164 'duration' => 'dt-duration',
1165 'description' => 'p-description',
1166 'summary' => 'p-summary',
// NOTE(review): 'description' looks to be listed twice in this run with the same value; the repeat appears redundant — confirm
1167 'description' => 'p-description',
1169 'category' => 'p-category',
1170 'location' => 'h-card',
1171 'geo' => 'p-location h-geo'
1174 'summary' => 'p-name',
1175 'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec
1176 'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo)
1177 'url' => 'u-url', // of the item being reviewed (p-item h-item u-url)
1178 'reviewer' => 'p-reviewer p-author h-card',
1179 'dtreviewed' => 'dt-reviewed',
1180 'rating' => 'p-rating',
1182 'worst' => 'p-worst',
1183 'description' => 'p-description'
1185 'hproduct' => array(
1187 'photo' => 'u-photo',
1188 'brand' => 'p-brand',
1189 'category' => 'p-category',
1190 'description' => 'p-description',
1191 'identifier' => 'u-identifier',
1193 'review' => 'p-review h-review',
1194 'price' => 'p-price'
/**
 * Split a URI (or relative reference) into the five generic components used by
 * the RFC 3986 resolution algorithm.
 *
 * The authority is rebuilt from the userinfo, host and port reported by
 * parse_url() and is only populated when a host is present.
 *
 * @param string $uri The URI or relative reference to split
 * @return array Map with the keys scheme, authority, path, query and fragment; a
 *               component that parse_url() does not report is null. All five are
 *               null when the input cannot be parsed at all.
 */
function parseUriToComponents($uri) {
    $result = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    $u = @parse_url($uri);

    # parse_url() returns false for seriously malformed URLs (e.g. "http:///example.com").
    # There is nothing to copy in that case, and array_key_exists() must not be
    # handed a non-array.
    if(!is_array($u)) {
        return $result;
    }

    if(array_key_exists('scheme', $u)) {
        $result['scheme'] = $u['scheme'];
    }

    if(array_key_exists('host', $u)) {
        # authority = [ user [ ":" pass ] "@" ] host [ ":" port ]
        if(array_key_exists('user', $u)) {
            $result['authority'] = $u['user'];
        }
        if(array_key_exists('pass', $u)) {
            $result['authority'] .= ':' . $u['pass'];
        }
        if(array_key_exists('user', $u) || array_key_exists('pass', $u)) {
            $result['authority'] .= '@';
        }
        $result['authority'] .= $u['host'];
        if(array_key_exists('port', $u)) {
            $result['authority'] .= ':' . $u['port'];
        }
    }

    if(array_key_exists('path', $u)) {
        $result['path'] = $u['path'];
    }

    if(array_key_exists('query', $u)) {
        $result['query'] = $u['query'];
    }

    if(array_key_exists('fragment', $u)) {
        $result['fragment'] = $u['fragment'];
    }

    return $result;
}
/**
 * Resolve a URI reference against a base URI, following the algorithm of
 * RFC 3986 section 5.2 (pre-parse the base, transform the reference, recompose).
 *
 * @param string $baseURI The URI the reference is relative to
 * @param string $referenceURI The (possibly relative) reference to resolve
 * @return string The recomposed target URI
 */
function resolveUrl($baseURI, $referenceURI) {
    # A component counts as present when it is a non-empty string. Plain truthiness
    # is not enough: PHP treats the legal component "0" as false, so a host of "0",
    # a query of "0" or a fragment of "0" would otherwise be dropped.
    $isPresent = function($component) {
        return (string) $component !== '';
    };

    $target = array(
        'scheme' => null,
        'authority' => null,
        'path' => null,
        'query' => null,
        'fragment' => null
    );

    # 5.2.1 Pre-parse the Base URI into its five components
    $base = parseUriToComponents($baseURI);

    # A base without a path (http://example.com) is treated as having the path "/"
    if(!$isPresent($base['path'])) {
        $base['path'] = '/';
    }

    # 5.2.2 Transform References
    # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R)
    $reference = parseUriToComponents($referenceURI);

    if($isPresent($reference['scheme'])) {
        # The reference is absolute: only its dot segments need normalising
        $target['scheme'] = $reference['scheme'];
        $target['authority'] = $reference['authority'];
        $target['path'] = removeDotSegments($reference['path']);
        $target['query'] = $reference['query'];
    } else {
        if($isPresent($reference['authority'])) {
            # Network-path reference (//host/path): inherit the scheme only
            $target['authority'] = $reference['authority'];
            $target['path'] = removeDotSegments($reference['path']);
            $target['query'] = $reference['query'];
        } else {
            if(!$isPresent($reference['path'])) {
                # Same-document style reference: keep the base path, and the base
                # query too unless the reference brings its own
                $target['path'] = $base['path'];
                if($isPresent($reference['query'])) {
                    $target['query'] = $reference['query'];
                } else {
                    $target['query'] = $base['query'];
                }
            } else {
                if(substr($reference['path'], 0, 1) == '/') {
                    # Absolute-path reference
                    $target['path'] = removeDotSegments($reference['path']);
                } else {
                    # Relative-path reference: merge with the base path first
                    $target['path'] = mergePaths($base, $reference);
                    $target['path'] = removeDotSegments($target['path']);
                }
                $target['query'] = $reference['query'];
            }
            $target['authority'] = $base['authority'];
        }
        $target['scheme'] = $base['scheme'];
    }
    $target['fragment'] = $reference['fragment'];

    # 5.3 Component Recomposition
    $result = '';
    if($isPresent($target['scheme'])) {
        $result .= $target['scheme'] . ':';
    }
    if($isPresent($target['authority'])) {
        $result .= '//' . $target['authority'];
    }
    $result .= $target['path'];
    if($isPresent($target['query'])) {
        $result .= '?' . $target['query'];
    }
    if($isPresent($target['fragment'])) {
        $result .= '#' . $target['fragment'];
    } elseif($referenceURI == '#') {
        # A bare "#" reference carries no fragment text, but the "#" itself is kept
        $result .= '#';
    }

    return $result;
}
/**
 * Merge a relative-path reference with the path of the base URI, as described in
 * RFC 3986 section 5.2.3. Dot segments are not removed here.
 *
 * @param array $base Components of the base URI; the authority and path keys are read
 * @param array $reference Components of the reference; the path key is read
 * @return string The merged path
 */
function mergePaths($base, $reference) {
    $basePath = (string) $base['path'];
    $referencePath = (string) $reference['path'];

    # If the base URI has a defined authority component and an empty path, the
    # result is "/" concatenated with the reference's path.
    if((string) $base['authority'] !== '' && $basePath === '') {
        return '/' . $referencePath;
    }

    $lastSlash = strrpos($basePath, '/');
    if($lastSlash === false) {
        # The base path contains no "/" at all, so the entire base path is the
        # segment being replaced and only the reference's path remains.
        return $referencePath;
    }

    # Otherwise append the reference's path to all but the last segment of the base
    # path, i.e. to everything up to and including its right-most "/".
    return substr($basePath, 0, $lastSlash + 1) . $referencePath;
}
1344 # 5.2.4.A Remove leading ../ or ./
/**
 * RFC 3986 section 5.2.4, step 2A: strip a leading "../" or "./" from the input
 * buffer. An input with neither prefix is left untouched.
 *
 * @param string $input Input buffer, modified in place
 * @return void
 */
function removeLeadingDotSlash(&$input) {
    # The (string) casts keep the buffer a string on PHP versions before 8, where
    # substr() returns false once the prefix makes up the whole buffer.
    if(substr($input, 0, 3) === '../') {
        $input = (string) substr($input, 3);
    } elseif(substr($input, 0, 2) === './') {
        $input = (string) substr($input, 2);
    }
}
1353 # 5.2.4.B Replace leading /. with /
/**
 * RFC 3986 section 5.2.4, step 2B: replace a leading "/./" (or a lone "/.") in the
 * input buffer with "/".
 *
 * The caller is expected to have checked that the buffer starts with one of those
 * prefixes; anything that does not begin with "/./" is handled as the "/." case.
 *
 * @param string $input Input buffer, modified in place
 * @return void
 */
function removeLeadingSlashDot(&$input) {
    if(substr($input, 0, 3) === '/./') {
        # "/./rest" becomes "/rest"
        $input = '/' . substr($input, 3);
    } else {
        # "/." becomes "/"
        $input = '/' . substr($input, 2);
    }
}
1362 # 5.2.4.C Given leading /../ remove component from output buffer
/**
 * RFC 3986 section 5.2.4, step 2C: replace a leading "/../" (or a lone "/..") in
 * the input buffer with "/", and remove the last segment together with its
 * preceding "/" (if any) from the output buffer.
 *
 * The caller is expected to have checked that the input starts with one of those
 * prefixes; anything that does not begin with "/../" is handled as the "/.." case.
 *
 * @param string $input Input buffer, modified in place
 * @param string $output Output buffer, modified in place
 * @return void
 */
function removeOneDirLevel(&$input, &$output) {
    if(substr($input, 0, 4) === '/../') {
        $input = '/' . substr($input, 4);
    } else {
        $input = '/' . substr($input, 3);
    }

    # Cut the output back to just before its right-most "/". When it holds no "/"
    # the whole buffer is the last segment, so nothing is left. This is spelled out
    # rather than passing the false result of strrpos() to substr() as a length.
    $output = (string) $output;
    $lastSlash = strrpos($output, '/');
    if($lastSlash === false) {
        $output = '';
    } else {
        $output = (string) substr($output, 0, $lastSlash);
    }
}
1372 # 5.2.4.D Remove . and .. if it's the only thing in the input
/**
 * RFC 3986 section 5.2.4, step 2D: remove a "." or ".." that is the only thing
 * left in the input buffer.
 *
 * The caller is expected to have checked that the buffer is exactly "." or "..":
 * one character is stripped for ".", two characters for anything else.
 *
 * @param string $input Input buffer, modified in place
 * @return void
 */
function removeLoneDotDot(&$input) {
    # The (string) casts keep the buffer a string on PHP versions before 8, where
    # substr() returns false once nothing is left.
    if($input === '.') {
        $input = (string) substr($input, 1);
    } else {
        $input = (string) substr($input, 2);
    }
}
1381 # 5.2.4.E Move one segment from input to output
/**
 * RFC 3986 section 5.2.4, step 2E: move the first path segment of the input
 * buffer to the end of the output buffer. The segment includes its initial "/"
 * (if any) and runs up to, but not including, the next "/" or the end of the input.
 *
 * @param string $input Input buffer, modified in place
 * @param string $output Output buffer, modified in place
 * @return void
 */
function moveOneSegmentFromInput(&$input, &$output) {
    # Look for the "/" that ends the segment, skipping the one the segment may
    # itself start with.
    if(substr($input, 0, 1) != '/') {
        $pos = strpos($input, '/');
    } else {
        $pos = strpos($input, '/', 1);
    }

    if($pos === false) {
        # No further "/": the rest of the input is the segment
        $output .= $input;
        $input = '';
    } else {
        $output .= substr($input, 0, $pos);
        $input = substr($input, $pos);
    }
}
1398 # 5.2.4 Remove Dot Segments
1399 function removeDotSegments($path) {
1400 # 1. The input buffer is initialized with the now-appended path
1401 # components and the output buffer is initialized to the empty
1408 # 2. While the input buffer is not empty, loop as follows:
1412 if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') {
1413 # A. If the input buffer begins with a prefix of "../" or "./",
1414 # then remove that prefix from the input buffer; otherwise,
1415 removeLeadingDotSlash($input);
1416 } elseif(substr($input, 0, 3) == '/./' || $input == '/.') {
1417 # B. if the input buffer begins with a prefix of "/./" or "/.",
1418 # where "." is a complete path segment, then replace that
1419 # prefix with "/" in the input buffer; otherwise,
1420 removeLeadingSlashDot($input);
1421 } elseif(substr($input, 0, 4) == '/../' || $input == '/..') {
1422 # C. if the input buffer begins with a prefix of "/../" or "/..",
1423 # where ".." is a complete path segment, then replace that
1424 # prefix with "/" in the input buffer and remove the last
1425 # segment and its preceding "/" (if any) from the output
1426 # buffer; otherwise,
1427 removeOneDirLevel($input, $output);
1428 } elseif($input == '.' || $input == '..') {
1429 # D. if the input buffer consists only of "." or "..", then remove
1430 # that from the input buffer; otherwise,
1431 removeLoneDotDot($input);
1433 # E. move the first path segment in the input buffer to the end of
1434 # the output buffer and any subsequent characters up to, but not including,
1435 # the next "/" character or the end of the input buffer
1436 moveOneSegmentFromInput($input, $output);