curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
- $response = curl_exec($ch);
+ $html = curl_exec($ch);
$info = $curlInfo = curl_getinfo($ch);
curl_close($ch);
return null;
}
- $html = mb_substr($response, $info['header_size']);
return parse($html, $url, $convertClassic);
}
* @return string|array The prefixed name of the first microfomats class found or false
*/
function mfNamesFromClass($class, $prefix='h-') {
- $class = str_replace([' ', ' ', "\n"], ' ', $class);
+ $class = str_replace(array(' ', ' ', "\n"), ' ', $class);
$classes = explode(' ', $class);
$matches = array();
foreach ($classes as $classname) {
- if (strpos($classname, $prefix) === 0 && $classname !== $prefix) {
+ $compare_classname = strtolower(' ' . $classname);
+ $compare_prefix = strtolower(' ' . $prefix);
+ if (stristr($compare_classname, $compare_prefix) !== false && ($compare_classname != $compare_prefix)) {
$matches[] = ($prefix === 'h-') ? $classname : substr($classname, strlen($prefix));
}
}
$prefixes = array('p-', 'u-', 'dt-', 'e-');
$propertyNames = array();
- $class = str_replace([' ', ' ', "\n"], ' ', $class);
+ $class = str_replace(array(' ', ' ', "\n"), ' ', $class);
foreach (explode(' ', $class) as $classname) {
foreach ($prefixes as $prefix) {
- if (strpos($classname, $prefix) === 0 and $classname !== $prefix) {
+ $compare_classname = strtolower(' ' . $classname);
+ if (stristr($compare_classname, $prefix) && ($compare_classname != $prefix)) {
$propertyNames = array_merge($propertyNames, mfNamesFromClass($classname, ltrim($prefix)));
}
}
preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);
// if no am/pm specified
- if ( empty($matches[4]) ) {
+ if (empty($matches[4])) {
return $time;
}
// else am/pm specified
$hh = $matches[1];
// add 12 to the pm hours
- if ( $meridiem == 'pm' && ($hh < 12) )
- {
+ if ($meridiem == 'pm' && ($hh < 12)) {
$hh += 12;
}
$hh = str_pad($hh, 2, '0', STR_PAD_LEFT);
// minutes
- $mm = ( empty($matches[2]) ) ? '00' : $matches[2];
+ $mm = (empty($matches[2]) ) ? '00' : $matches[2];
// seconds, only if supplied
- if ( !empty($matches[3]) )
- {
+ if (!empty($matches[3])) {
$ss = $matches[3];
}
- if ( empty($ss) ) {
+ if (empty($ss)) {
return sprintf('%s:%s', $hh, $mm);
}
else {
return sprintf('%s:%s:%s', $hh, $mm, $ss);
}
-
}
-
}
/**
}
break;
}
+
+ // Ignore <template> elements as per the HTML5 spec
+ foreach ($this->xpath->query('//template') as $templateEl) {
+ $templateEl->parentNode->removeChild($templateEl);
+ }
$this->baseurl = $baseurl;
$this->doc = $doc;
return true;
}
-
+
+ private function resolveChildUrls(DOMElement $el) {
+ $hyperlinkChildren = $this->xpath->query('.//*[@src or @href or @data]', $el);
+
+ foreach ($hyperlinkChildren as $child) {
+ if ($child->hasAttribute('href'))
+ $child->setAttribute('href', $this->resolveUrl($child->getAttribute('href')));
+ if ($child->hasAttribute('src'))
+ $child->setAttribute('src', $this->resolveUrl($child->getAttribute('src')));
+ if ($child->hasAttribute('data'))
+ $child->setAttribute('data', $this->resolveUrl($child->getAttribute('data')));
+ }
+ }
+
+ public function textContent(DOMElement $el) {
+ $this->resolveChildUrls($el);
+
+ $clonedEl = $el->cloneNode(true);
+
+ foreach ($this->xpath->query('.//img', $clonedEl) as $imgEl) {
+ $newNode = $this->doc->createTextNode($imgEl->getAttribute($imgEl->hasAttribute('alt') ? 'alt' : 'src'));
+ $imgEl->parentNode->replaceChild($newNode, $imgEl);
+ }
+
+ return $clonedEl->textContent;
+ }
+
// TODO: figure out if this has problems with sms: and geo: URLs
public function resolveUrl($url) {
// If the URL is seriously malformed it’s probably beyond the scope of this
// Process value-class stuff
$val = '';
foreach ($valueClassElements as $el) {
- $val .= $el->textContent;
+ $val .= $this->textContent($el);
}
return unicodeTrim($val);
} elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') {
$pValue = $p->getAttribute('value');
} else {
- $pValue = unicodeTrim($p->textContent);
+ $pValue = unicodeTrim($this->textContent($p));
}
return $pValue;
} elseif (in_array($u->tagName, array('data', 'input')) and $u->getAttribute('value') !== null) {
return $u->getAttribute('value');
} else {
- return unicodeTrim($u->textContent);
+ return unicodeTrim($this->textContent($u));
}
}
// Expand relative URLs within children of this element
// TODO: as it is this is not relative to only children, make this .// and rerun tests
- $hyperlinkChildren = $this->xpath->query('//*[@src or @href or @data]', $e);
-
- foreach ($hyperlinkChildren as $child) {
- if ($child->hasAttribute('href'))
- $child->setAttribute('href', $this->resolveUrl($child->getAttribute('href')));
- if ($child->hasAttribute('src'))
- $child->setAttribute('src', $this->resolveUrl($child->getAttribute('src')));
- if ($child->hasAttribute('data'))
- $child->setAttribute('data', $this->resolveUrl($child->getAttribute('data')));
- }
-
+ $this->resolveChildUrls($e);
+
$html = '';
foreach ($e->childNodes as $node) {
$html .= $node->C14N();
return array(
'html' => $html,
- 'value' => unicodeTrim($e->textContent)
+ 'value' => unicodeTrim($this->textContent($e))
);
}
'hrecipe' => 'h-recipe',
'hresume' => 'h-resume',
'hevent' => 'h-event',
- 'hreview' => 'h-review'
+ 'hreview' => 'h-review',
+ 'hproduct' => 'h-product'
);
public $classicPropertyMap = array(
'best' => 'p-best',
'worst' => 'p-worst',
'description' => 'p-description'
+ ),
+ 'hproduct' => array(
+ 'fn' => 'p-name',
+ 'photo' => 'u-photo',
+ 'brand' => 'p-brand',
+ 'category' => 'p-category',
+ 'description' => 'p-description',
+ 'identifier' => 'u-identifier',
+ 'url' => 'u-url',
+ 'review' => 'p-review h-review',
+ 'price' => 'p-price'
)
);
}