. */

/**
 * @package FeedSubPlugin
 * @maintainer Brion Vibber
 */

if (!defined('STATUSNET') && !defined('LACONICA')) {
    exit(1);
}

/** The URL could not be fetched at all (HTTP client failure). */
class FeedSubBadURLException extends FeedSubException
{
}

/** The server answered with a non-OK HTTP status. */
class FeedSubBadResponseException extends FeedSubException
{
}

/** The response body was empty. */
class FeedSubEmptyException extends FeedSubException
{
}

/** The HTML page could not be parsed. */
class FeedSubBadHTMLException extends FeedSubException
{
}

/** The response Content-Type is not one of the accepted XML feed types. */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}

/** The HTML page contained no feed autodiscovery link. */
class FeedSubNoFeedException extends FeedSubException
{
}

/** The feed body could not be parsed as XML. */
class FeedSubBadXmlException extends FeedSubException
{
}

/** Not thrown in this part of the file; declared here for other users. */
class FeedSubNoHubException extends FeedSubException
{
}

/**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *       print $feed->uri;
 *       print $feed->type;
 *       processFeed($feed->feed); // DOMDocument
 *   }
 */
class FeedDiscovery
{
    /** @var string URL the feed was loaded from; set by init() */
    public $uri;

    /** @var string Content-Type passed to init() */
    public $type;

    /** @var DOMDocument parsed feed document; set by init() */
    public $feed;

    /**
     * Post-initialize query helper: look up a link in the loaded feed.
     * Currently only Atom links are consulted.
     *
     * @param string $rel  link relation to look for
     * @param string $type optional type to match
     * @return mixed whatever ActivityUtils::getLink() returns
     */
    public function getLink($rel, $type=null)
    {
        // @fixme check for non-Atom links in RSS2 feeds as well
        return $this->getAtomLink($rel, $type);
    }

    /**
     * Look up an Atom link on the root element of the loaded feed.
     * Only valid after a successful discover*()/init() call has set $this->feed.
     *
     * @param string $rel  link relation to look for
     * @param string $type optional type to match
     * @return mixed whatever ActivityUtils::getLink() returns
     */
    public function getAtomLink($rel, $type=null)
    {
        return ActivityUtils::getLink($this->feed->documentElement, $rel, $type);
    }

    /**
     * Fetch a URL and initialize this object from the feed found there.
     * If the response is an HTML page and $htmlOk is true, the page is
     * scanned for an autodiscovery link, which is then fetched as a feed
     * (HTML is not followed a second time).
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return string with validated URL
     * @throws FeedSubBadURLException
     * @throws FeedSubBadResponseException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubNoFeedException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     * @throws FeedSubBadXmlException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
            throw new FeedSubBadURLException($e);
        }

        if ($htmlOk) {
            $type = $response->getHeader('Content-Type');
            // Cast: the header may be absent, and preg_match() no longer
            // accepts a null subject silently on current PHP versions.
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', (string) $type);
            if ($isHtml) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Like discoverFromURL(), but never follows HTML pages.
     *
     * @param string $url
     * @return string with validated URL
     */
    public function discoverFromFeedURL($url)
    {
        return $this->discoverFromURL($url, false);
    }

    /**
     * Validate an HTTP response and initialize from its body.
     *
     * @param object $response HTTP response as returned by HTTPClient::get()
     * @return string URL the feed was loaded from
     * @throws FeedSubBadResponseException if the response status is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not an XML feed type
     * @throws FeedSubBadXmlException if the body is not parseable XML
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getStatus());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        $type = $response->getHeader('Content-Type');
        // Cast for the same reason as in discoverFromURL(): header may be missing.
        if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', (string) $type)) {
            return $this->init($sourceurl, $type, $body);
        }

        common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
        throw new FeedSubUnrecognizedTypeException($type);
    }

    /**
     * Parse a feed body and store the result on this object.
     *
     * @param string $sourceurl URL the body was fetched from
     * @param string $type      Content-Type of the body
     * @param string $body      XML text
     * @return string $sourceurl, now also available as $this->uri
     * @throws FeedSubBadXmlException if the body cannot be parsed as XML
     */
    public function init($sourceurl, $type, $body)
    {
        $feed = new DOMDocument();
        if (!$feed->loadXML($body)) {
            // Report the URL we were given; the original referenced an
            // undefined $url here, so the exception carried no URL at all.
            throw new FeedSubBadXmlException($sourceurl);
        }

        $this->uri = $sourceurl;
        $this->type = $type;
        $this->feed = $feed;
        return $this->uri;
    }

    /**
     * Scan an HTML page for feed autodiscovery links.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException if the page cannot be parsed
     */
    public function discoverFromHTML($url, $body)
    {
        // An empty document can never parse. Reject it up front: PHP 8's
        // loadHTML() throws ValueError on empty input instead of returning
        // false, which would also skip the error_reporting() restore below.
        if ((string) $body === '') {
            throw new FeedSubBadHTMLException();
        }

        // DOMDocument::loadHTML may throw warnings on unrecognized elements.
        $old = error_reporting(error_reporting() & ~E_WARNING);
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            throw new FeedSubBadHTMLException();
        }

        // Autodiscovery links may be relative to the page's URL or to a
        // <base href> element; if several <base> elements carry an href,
        // the last one wins.
        $base = false;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if ($node->hasAttributes()) {
                $href = $node->attributes->getNamedItem('href');
                if ($href) {
                    $base = trim($href->value);
                }
            }
        }
        if ($base) {
            $base = $this->resolveURI($base, $url);
        } else {
            $base = $url;
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );

        foreach ($dom->getElementsByTagName('link') as $node) {
            if (!$node->hasAttributes()) {
                continue;
            }
            $rel = $node->attributes->getNamedItem('rel');
            $type = $node->attributes->getNamedItem('type');
            $href = $node->attributes->getNamedItem('href');
            if (!($rel && $type && $href)) {
                continue;
            }

            $rel = trim($rel->value);
            $type = trim($type->value);
            $href = trim($href->value);

            if ($rel === 'alternate' && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $feedUrl) {
            if ($feedUrl) {
                return $feedUrl;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     *
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and hand back the unresolved link.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}