3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2009, StatusNet, Inc.
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * @package FeedSubPlugin
22 * @maintainer Brion Vibber <brion@status.net>
// Entry guard: tests whether the STATUSNET constant is undefined. Only the
// opening line of the conditional is visible in this listing; its body is not
// shown here.
25 if (!defined('STATUSNET')) {
// Exception types for feed discovery. Each one extends FeedSubException, which
// is declared outside this listing; the class bodies are not visible here.
// Thrown by discoverFromURL() when the HTTP fetch raises HTTP_Request2_Exception.
29 class FeedSubBadURLException extends FeedSubException
// Thrown by initFromResponse() when $response->isOk() is false; carries the status.
33 class FeedSubBadResponseException extends FeedSubException
// Thrown by initFromResponse() with the source URL.
// NOTE(review): the guard is not visible here; presumably an empty body. Confirm.
37 class FeedSubEmptyException extends FeedSubException
// Thrown by discoverFromHTML(), where it is spelled FeedSubBadHtmlException
// (PHP resolves class names case-insensitively).
41 class FeedSubBadHTMLException extends FeedSubException
// Thrown by initFromResponse() when Content-Type is not an accepted XML feed type.
45 class FeedSubUnrecognizedTypeException extends FeedSubException
// Thrown by discoverFromURL() on the HTML path, carrying the requested URL.
// NOTE(review): the guard is not visible here; presumably no feed link was found.
49 class FeedSubNoFeedException extends FeedSubException
// Thrown by init() with the source URL on its failure paths.
53 class FeedSubBadXmlException extends FeedSubException
// Not thrown anywhere in the visible part of this file.
57 class FeedSubNoHubException extends FeedSubException
62 * Given a web page or feed URL, discover the final location of the feed
63 * and return its current contents.
66 * $feed = new FeedDiscovery();
67 * if ($feed->discoverFromURL($url)) {
70 * processFeed($feed->feed); // DOMDocument
/**
 * Post-initialize query helper: look up a link on the discovered feed.
 *
 * At present only Atom-style links are consulted; the arguments are handed
 * to getAtomLink() unchanged. getAtomLink() reads $this->root, so this is
 * meant to be called once the feed has been initialized.
 *
 * @param mixed $rel  link relation to look for (getHubLink() passes 'hub')
 * @param mixed $type optional link type, forwarded as-is; null when omitted
 * @return mixed whatever getAtomLink() returns
 */
public function getLink($rel, $type=null)
{
    // @fixme check for non-Atom links in RSS2 feeds as well
    // Dispatch through $this, as getHubLink() does, instead of self:: so the
    // call is an ordinary instance call and honours subclass overrides.
    return $this->getAtomLink($rel, $type);
}
/**
 * Look up an Atom link on the feed's root element.
 *
 * Delegates to ActivityUtils::getLink() with $this->root as the element
 * to search.
 *
 * @param mixed $rel  link relation, forwarded as-is
 * @param mixed $type optional link type, forwarded as-is; null when omitted
 * @return mixed the result of ActivityUtils::getLink()
 */
public function getAtomLink($rel, $type=null)
{
    $link = ActivityUtils::getLink($this->root, $rel, $type);

    return $link;
}
/**
 * Get the referenced PuSH hub link from an Atom feed.
 *
 * Shorthand for getAtomLink() with the 'hub' relation and no type filter.
 *
 * @return mixed string or false
 */
public function getHubLink()
{
    $hub = $this->getAtomLink('hub');

    return $hub;
}
104 * @param bool $htmlOk pass false here if you don't want to follow web pages.
105 * @return string with validated URL
106 * @throws FeedSubBadURLException
107 * @throws FeedSubBadHTMLException
108 * @throws FeedSubNoFeedException
109 * @throws FeedSubEmptyException
110 * @throws FeedSubUnrecognizedTypeException
// Fetch $url and initialise this object from the feed found there.
// Visible flow: the URL is fetched with HTTPClient::get(); an
// HTTP_Request2_Exception is logged and rethrown as FeedSubBadURLException.
// The Content-Type header is then tested for an HTML/XHTML type. On the HTML
// path discoverFromHTML() supplies a feed URL and this method calls itself on
// that URL with $htmlOk=false. Otherwise the response is handed to
// initFromResponse(), whose result is returned.
// NOTE(review): the `try {` opener and the `if` lines guarding the HTML branch
// and the FeedSubNoFeedException throw are not visible in this listing.
// Presumably they test $htmlOk, $isHtml and an empty $target; confirm.
112 function discoverFromURL($url, $htmlOk=true)
115 $client = new HTTPClient();
116 $response = $client->get($url);
117 } catch (HTTP_Request2_Exception $e) {
// Log with the method name for context, then surface the failure as a
// discovery-level exception carrying the same message.
118 common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
119 throw new FeedSubBadURLException($e->getMessage());
123 $type = $response->getHeader('Content-Type');
// Case-insensitive prefix match, so anything after the media type in the
// header value (such as a charset parameter) does not affect the result.
124 $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
// The response's own URL (getUrl()) is passed as the base for relative links,
// not the $url argument. Presumably this is the post-redirect location; confirm.
126 $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
128 throw new FeedSubNoFeedException($url);
// $htmlOk is false on the recursive call, so the target is not followed as a
// web page a second time.
130 return $this->discoverFromURL($target, false);
134 return $this->initFromResponse($response);
/**
 * Discover a feed from a URL without following web pages.
 *
 * Same as discoverFromURL() with $htmlOk forced to false.
 *
 * @param mixed $url URL handed to discoverFromURL() unchanged
 * @return mixed whatever discoverFromURL() returns
 */
function discoverFromFeedURL($url)
{
    $htmlOk = false;

    return $this->discoverFromURL($url, $htmlOk);
}
// Validate an HTTP response and hand its body to init().
// Visible behaviour: throws FeedSubBadResponseException with the status when
// isOk() is false; throws FeedSubEmptyException with the response URL; calls
// init() when Content-Type begins with one of the accepted XML types and
// returns its result; otherwise logs a warning and throws
// FeedSubUnrecognizedTypeException with the header value.
// NOTE(review): the guard in front of the FeedSubEmptyException throw is not
// visible in this listing. Presumably it tests for an empty $body; confirm.
142 function initFromResponse($response)
144 if (!$response->isOk()) {
145 throw new FeedSubBadResponseException($response->getStatus());
// URL and body are taken from the response object itself.
148 $sourceurl = $response->getUrl();
149 $body = $response->getBody();
151 throw new FeedSubEmptyException($sourceurl);
154 $type = $response->getHeader('Content-Type');
// Case-insensitive prefix match: generic XML, RSS and Atom media types pass.
155 if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
156 return $this->init($sourceurl, $type, $body);
158 common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
159 throw new FeedSubUnrecognizedTypeException($type);
// Parse $body as XML and record the feed on this object.
// Visible behaviour: the body is loaded into a DOMDocument; on success
// $this->uri is set to $sourceurl and the document element is examined. For an
// <rss> element the first <channel> descendant becomes $this->root.
// FeedSubBadXmlException($sourceurl) is thrown on three separate paths.
// NOTE(review): several lines are not visible in this listing: the assignment
// of $this->feed (read on the documentElement line), the body of the 'feed'
// branch, the `else` lines pairing with each throw, and the return value.
// Presumably the throws cover an rss document without a channel, an unknown
// root element, and a loadXML() failure; confirm against the full file.
163 function init($sourceurl, $type, $body)
165 $feed = new DOMDocument();
166 if ($feed->loadXML($body)) {
167 $this->uri = $sourceurl;
171 $el = $this->feed->documentElement;
173 // Looking for the "root" element: RSS channel or Atom feed
175 if ($el->tagName == 'rss') {
// getElementsByTagName() searches all descendants; the first match is used.
176 $channels = $el->getElementsByTagName('channel');
177 if ($channels->length > 0) {
178 $this->root = $channels->item(0);
180 throw new FeedSubBadXmlException($sourceurl);
182 } else if ($el->tagName == 'feed') {
185 throw new FeedSubBadXmlException($sourceurl);
190 throw new FeedSubBadXmlException($sourceurl);
195 * @param string $url source URL, used to resolve relative links
196 * @param string $body HTML body text
197 * @return mixed string with URL or false if no target found
// Scan an HTML page for feed autodiscovery <link> elements.
// Visible behaviour: the body is parsed with DOMDocument::loadHTML() while
// warnings and notices are masked; <base> elements are read for an href that
// is resolved against $url; every <link> carrying rel, type and href is then
// checked, and the first href seen for each known feed type is stored after
// resolving it against $base.
// NOTE(review): several lines are not visible in this listing: the guard in
// front of the FeedSubBadHtmlException throw, the initial value of $base, the
// guards around the $base assignments, the opening of the $feeds array, and
// the body of the final foreach with the return statements. Confirm against
// the full file.
199 function discoverFromHTML($url, $body)
201 // DOMDocument::loadHTML may throw warnings on unrecognized elements,
202 // and notices on unrecognized namespaces.
// Mask E_WARNING and E_NOTICE for the parse only; the previous level is
// restored straight after loadHTML() returns.
203 $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
204 $dom = new DOMDocument();
205 $ok = $dom->loadHTML($body);
206 error_reporting($old);
209 throw new FeedSubBadHtmlException();
212 // Autodiscovery links may be relative to the page's URL or <base href>
214 $nodes = $dom->getElementsByTagName('base');
215 for ($i = 0; $i < $nodes->length; $i++) {
216 $node = $nodes->item($i);
217 if ($node->hasAttributes()) {
218 $href = $node->attributes->getNamedItem('href');
220 $base = trim($href->value);
// A <base href> may itself be relative, so it is resolved against the page URL.
225 $base = $this->resolveURI($base, $url);
230 // Ok... now on to the links!
231 // Types listed in order of priority -- we'll prefer Atom if available.
232 // @fixme merge with the munger link checks
// Each known type starts as false, meaning nothing has been found for it yet.
234 'application/atom+xml' => false,
235 'application/rss+xml' => false,
238 $nodes = $dom->getElementsByTagName('link');
239 for ($i = 0; $i < $nodes->length; $i++) {
240 $node = $nodes->item($i);
241 if ($node->hasAttributes()) {
242 $rel = $node->attributes->getNamedItem('rel');
243 $type = $node->attributes->getNamedItem('type');
244 $href = $node->attributes->getNamedItem('href');
245 if ($rel && $type && $href) {
// rel is split on single spaces only; array_filter() drops the empty tokens
// that repeated spaces produce. Other whitespace separators are not split.
246 $rel = array_filter(explode(" ", $rel->value));
247 $type = trim($type->value);
248 $href = trim($href->value);
// Accept only rel tokens equal to 'alternate' and types present in $feeds;
// empty() keeps the first href found for a type from being overwritten.
250 if (in_array('alternate', $rel) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
251 // Save the first feed found of each type...
252 $feeds[$type] = $this->resolveURI($href, $base);
258 // Return the highest-priority feed found
// Iteration follows the insertion order of $feeds, so Atom is visited first.
259 foreach ($feeds as $type => $url) {
269 * Resolve a possibly relative URL against some absolute base URL
270 * @param string $rel relative or absolute URL
271 * @param string $base absolute URL
272 * @return string absolute URL, or original URL if could not be resolved.
// Resolve $rel against $base using PEAR Net_URL2.
// Visible behaviour: Net/URL2.php is loaded on demand; $rel is wrapped in a
// Net_URL2 and tested with isAbsolute(); $base is wrapped as well and
// resolve() produces the absolute URL whose string form is returned. Any
// Exception is caught and logged at LOG_WARNING with both inputs.
// NOTE(review): the `try {` opener, the body of the isAbsolute() branch and
// whatever follows the log call are not visible in this listing. The @return
// line of the docblock says the original URL is returned when resolution
// fails; confirm against the full file.
274 function resolveURI($rel, $base)
// require_once makes repeated calls cheap: the library is loaded once.
276 require_once "Net/URL2.php";
278 $relUrl = new Net_URL2($rel);
279 if ($relUrl->isAbsolute()) {
282 $baseUrl = new Net_URL2($base);
283 $absUrl = $baseUrl->resolve($relUrl);
284 return $absUrl->getURL();
285 } catch (Exception $e) {
286 common_log(LOG_WARNING, 'Unable to resolve relative link "' .
287 $rel . '" against base "' . $base . '": ' . $e->getMessage());