3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2009, StatusNet, Inc.
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * @package FeedSubPlugin
22 * @maintainer Brion Vibber <brion@status.net>
25 if (!defined('STATUSNET')) {
/**
 * Raised by FeedDiscovery::discoverFromURL() when the HTTP request for the
 * given URL fails outright.
 */
class FeedSubBadURLException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::initFromResponse() when the HTTP response is not
 * OK; carries the response status.
 */
class FeedSubBadResponseException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::initFromResponse() for a response that has no
 * usable body; carries the effective source URL.
 */
class FeedSubEmptyException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::discoverFromHTML() when the page cannot be loaded
 * as an HTML document.
 */
class FeedSubBadHTMLException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::initFromResponse() when the Content-Type is not
 * one of the recognized XML/RSS/Atom types; carries the offending type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::discoverFromURL() when an HTML page yields no
 * feed to follow; carries the URL that was requested.
 */
class FeedSubNoFeedException extends FeedSubException {}

/**
 * NOTE(review): not raised anywhere in this file; presumably signals a feed
 * without a Salmon endpoint -- confirm against callers.
 */
class FeedSubNoSalmonException extends FeedSubException {}

/**
 * Raised by FeedDiscovery::init() when the body is not parseable XML or its
 * document element is neither an RSS feed with a channel nor an Atom feed.
 */
class FeedSubBadXmlException extends FeedSubException {}

/**
 * NOTE(review): not raised anywhere in this file; presumably signals a feed
 * that advertises no hub link -- confirm against callers.
 */
class FeedSubNoHubException extends FeedSubException {}
66 * Given a web page or feed URL, discover the final location of the feed
67 * and return its current contents.
70 * $feed = new FeedDiscovery();
71 * if ($feed->discoverFromURL($url)) {
74 * processFeed($feed->feed); // DOMDocument
/**
 * Post-initialize query helper: look up a link on the discovered feed.
 *
 * Only meaningful once the feed's root element has been set, since the
 * lookup is delegated to getAtomLink(), which reads $this->root.
 *
 * @param string      $rel  link relation to look for
 * @param string|null $type optional type passed through to the lookup
 * @return mixed whatever getAtomLink() returns for this relation
 */
public function getLink($rel, $type=null)
{
    // @fixme check for non-Atom links in RSS2 feeds as well
    // Dispatch through $this (not self::) so the call is an ordinary
    // instance call and subclass overrides of getAtomLink() are honoured.
    return $this->getAtomLink($rel, $type);
}
/**
 * Look up a link with the given relation (and optional type) beneath the
 * feed's root element, using ActivityUtils::getLink().
 *
 * @param string      $rel  link relation to look for
 * @param string|null $type optional type passed through to the lookup
 * @return mixed result of ActivityUtils::getLink()
 */
public function getAtomLink($rel, $type=null)
{
    $rootElement = $this->root;

    return ActivityUtils::getLink($rootElement, $rel, $type);
}
/**
 * Get the referenced WebSub hub link from an Atom feed.
 *
 * Shorthand for an Atom link lookup with the 'hub' relation.
 *
 * @return mixed string or false
 */
public function getHubLink()
{
    $hubLink = $this->getAtomLink('hub');

    return $hubLink;
}
/**
 * Fetch a URL and initialize this object from the feed found there.
 *
 * If the response is an HTML page and $htmlOk is true, the page is scanned
 * for a feed autodiscovery link and that link is fetched instead (with HTML
 * following disabled, so at most one hop is made).
 *
 * @param string $url    web page or feed URL to fetch
 * @param bool   $htmlOk pass false here if you don't want to follow web pages.
 * @return string with validated URL
 * @throws FeedSubBadURLException
 * @throws FeedSubBadHtmlException
 * @throws FeedSubNoFeedException
 * @throws FeedSubEmptyException
 * @throws FeedSubUnrecognizedTypeException
 * @throws FeedSubBadResponseException
 * @throws FeedSubBadXmlException
 */
public function discoverFromURL($url, $htmlOk=true)
{
    try {
        $client = new HTTPClient();
        $response = $client->get($url);
    } catch (Exception $e) {
        common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
        throw new FeedSubBadURLException($e->getMessage());
    }

    if ($htmlOk) {
        // A response without a Content-Type header must not reach
        // preg_match() as a non-string (deprecated since PHP 8.1).
        $type = $response->getHeader('Content-Type');
        if (!is_string($type)) {
            $type = '';
        }

        $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
        if ($isHtml) {
            $target = $this->discoverFromHTML($response->getEffectiveUrl(), $response->getBody());
            if (!$target) {
                throw new FeedSubNoFeedException($url);
            }
            // Second pass never follows HTML again, which bounds the recursion.
            return $this->discoverFromURL($target, false);
        }
    }

    return $this->initFromResponse($response);
}
/**
 * Fetch a URL that is expected to be a feed itself: same as
 * discoverFromURL() but with HTML page following switched off.
 *
 * @param string $url feed URL to fetch
 * @return mixed result of discoverFromURL()
 */
function discoverFromFeedURL($url)
{
    $followHtmlPages = false;

    return $this->discoverFromURL($url, $followHtmlPages);
}
/**
 * Validate an HTTP response and initialize this object from its body.
 *
 * @param object $response HTTP response offering isOk(), getStatus(),
 *                         getEffectiveUrl(), getBody() and getHeader()
 * @return mixed result of init() for the response's URL, type and body
 * @throws FeedSubBadResponseException      if the response is not OK
 * @throws FeedSubEmptyException            if the response has no body
 * @throws FeedSubUnrecognizedTypeException if the Content-Type is not a
 *                                          recognized XML/RSS/Atom type
 */
public function initFromResponse($response)
{
    if (!$response->isOk()) {
        throw new FeedSubBadResponseException($response->getStatus());
    }

    $sourceurl = $response->getEffectiveUrl();
    $body = $response->getBody();
    if (!$body) {
        throw new FeedSubEmptyException($sourceurl);
    }

    // Normalize a missing Content-Type header to '' so that preg_match()
    // and the log message below always receive a string.
    $type = $response->getHeader('Content-Type');
    if (!is_string($type)) {
        $type = '';
    }

    if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
        return $this->init($sourceurl, $type, $body);
    }

    common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
    throw new FeedSubUnrecognizedTypeException($type);
}
/**
 * Parse a feed body and record it on this object.
 *
 * On success $this->uri, $this->type, $this->feed (DOMDocument) and
 * $this->root are set; $this->root is the first <channel> of an RSS
 * document or the <feed> element of an Atom document. Nothing is assigned
 * unless the whole document validates, so a failed call leaves any
 * previously discovered state untouched.
 *
 * @param string $sourceurl URL the body was fetched from
 * @param string $type      Content-Type reported for the body
 * @param string $body      XML text of the feed
 * @return string the source URL, as stored in $this->uri
 * @throws FeedSubBadXmlException if the body is empty, is not parseable XML,
 *                                or has no RSS channel / Atom feed root
 */
public function init($sourceurl, $type, $body)
{
    // DOMDocument::loadXML() raises a ValueError (not an Exception) for an
    // empty source on PHP 8, so report that case as bad XML up front.
    if (!is_string($body) || $body === '') {
        throw new FeedSubBadXmlException($sourceurl);
    }

    $feed = new DOMDocument();
    if (!$feed->loadXML($body)) {
        throw new FeedSubBadXmlException($sourceurl);
    }

    // Looking for the "root" element: RSS channel or Atom feed
    $el = $feed->documentElement;
    if ($el->tagName === 'rss') {
        $channels = $el->getElementsByTagName('channel');
        if ($channels->length === 0) {
            throw new FeedSubBadXmlException($sourceurl);
        }
        $root = $channels->item(0);
    } else if ($el->tagName === 'feed') {
        $root = $el;
    } else {
        throw new FeedSubBadXmlException($sourceurl);
    }

    // Commit state only after validation so the object is never left
    // with a new document but a stale root.
    $this->uri = $sourceurl;
    $this->type = $type;
    $this->feed = $feed;
    $this->root = $root;

    return $this->uri;
}
/**
 * Scan an HTML page for feed autodiscovery <link> elements.
 *
 * Links are resolved against the page's <base href> when one is present,
 * otherwise against the page URL. An Atom feed is preferred over an RSS
 * feed; within each type the first link found wins.
 *
 * @param string $url source URL, used to resolve relative links
 * @param string $body HTML body text
 * @return mixed string with URL or false if no target found
 * @throws FeedSubBadHtmlException if the body is empty or cannot be loaded
 */
public function discoverFromHTML($url, $body)
{
    // DOMDocument::loadHTML() raises a ValueError (not an Exception) for an
    // empty source on PHP 8; an empty page cannot carry a feed link anyway.
    if (!is_string($body) || $body === '') {
        throw new FeedSubBadHtmlException();
    }

    // DOMDocument::loadHTML may throw warnings on unrecognized elements,
    // and notices on unrecognized namespaces.
    $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
    $dom = new DOMDocument();
    $ok = $dom->loadHTML($body);
    error_reporting($old);

    if (!$ok) {
        throw new FeedSubBadHtmlException();
    }

    // Autodiscovery links may be relative to the page's URL or <base href>
    $base = false;
    $nodes = $dom->getElementsByTagName('base');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if ($node->hasAttribute('href')) {
            $base = trim($node->getAttribute('href'));
        }
    }
    if ($base) {
        // The <base href> may itself be relative to the page URL.
        $base = $this->resolveURI($base, $url);
    } else {
        $base = $url;
    }

    // Ok... now on to the links!
    // Types listed in order of priority -- we'll prefer Atom if available.
    // @fixme merge with the munger link checks
    $feeds = array(
        'application/atom+xml' => false,
        'application/rss+xml' => false,
    );

    $nodes = $dom->getElementsByTagName('link');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if (!$node->hasAttribute('rel') || !$node->hasAttribute('type') || !$node->hasAttribute('href')) {
            continue;
        }

        // rel holds whitespace-separated, case-insensitive keywords and MIME
        // types are case-insensitive, so normalize both before comparing.
        $rel = preg_split('/\s+/', strtolower($node->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        $type = strtolower(trim($node->getAttribute('type')));
        $href = trim($node->getAttribute('href'));

        if (in_array('alternate', $rel, true) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
            // Save the first feed found of each type...
            $feeds[$type] = $this->resolveURI($href, $base);
        }
    }

    // Return the highest-priority feed found
    foreach ($feeds as $feedUrl) {
        if ($feedUrl) {
            return $feedUrl;
        }
    }

    return false;
}
273 * Resolve a possibly relative URL against some absolute base URL
274 * @param string $rel relative or absolute URL
275 * @param string $base absolute URL
276 * @return string absolute URL, or original URL if could not be resolved.
278 function resolveURI($rel, $base)
280 require_once "Net/URL2.php";
282 $relUrl = new Net_URL2($rel);
283 if ($relUrl->isAbsolute()) {
286 $baseUrl = new Net_URL2($base);
287 $absUrl = $baseUrl->resolve($relUrl);
288 return $absUrl->getURL();
289 } catch (Exception $e) {
290 common_log(LOG_WARNING, 'Unable to resolve relative link "' .
291 $rel . '" against base "' . $base . '": ' . $e->getMessage());