3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2009, StatusNet, Inc.
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * @package FeedSubPlugin
22 * @maintainer Brion Vibber <brion@status.net>
// Stop immediately (exit status 1) unless one of the application
// constants STATUSNET or LACONICA has been defined by the including code.
if (!(defined('STATUSNET') || defined('LACONICA'))) {
    exit(1);
}
/**
 * Raised by discoverFromURL() when the HTTP request itself fails
 * (an HTTP_Request2_Exception is caught and its message is reused).
 */
class FeedSubBadURLException extends FeedSubException
{
}
/**
 * Raised by initFromResponse() when the response reports that it is not OK;
 * constructed with the response status.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
/**
 * Raised by initFromResponse() with the source URL as its argument;
 * presumably signals a response without any body content.
 */
class FeedSubEmptyException extends FeedSubException
{
}
/**
 * Raised by discoverFromHTML(), which loads a page body with
 * DOMDocument::loadHTML(); presumably signals that the page could not
 * be parsed.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
/**
 * Raised by initFromResponse() for a Content-Type that is not one of the
 * accepted XML/RSS/Atom types; constructed with the offending type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
/**
 * Raised by discoverFromURL() with the requested URL as its argument;
 * presumably signals an HTML page on which no feed link was found.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
/**
 * Raised by init() with the source URL as its argument when the feed
 * document is rejected (for example an <rss> element without a <channel>).
 */
class FeedSubBadXmlException extends FeedSubException
{
}
/**
 * Not raised anywhere in the visible part of this file.
 * NOTE(review): presumably signals a feed that has no hub link -- confirm
 * against the callers that throw it.
 */
class FeedSubNoHubException extends FeedSubException
{
}
60 * Given a web page or feed URL, discover the final location of the feed
61 * and return its current contents.
64 * $feed = new FeedDiscovery();
65 * if ($feed->discoverFromURL($url)) {
68 * processFeed($feed->feed); // DOMDocument
/**
 * Post-initialize query helper: look up a link on the loaded feed.
 *
 * At present this only forwards to getAtomLink(); as the @fixme below
 * notes, non-Atom links in RSS2 feeds are not examined.
 *
 * @param string $rel  link relation to look for
 * @param mixed  $type optional type, handed through unchanged (default null)
 * @return mixed whatever getAtomLink() returns for these arguments
 */
public function getLink($rel, $type=null)
{
    // @fixme check for non-Atom links in RSS2 feeds as well
    // Dispatch through the instance (as getHubLink() does) instead of
    // self::, so the call is an ordinary instance call and a subclass
    // override of getAtomLink() is honoured.
    return $this->getAtomLink($rel, $type);
}
/**
 * Look up a link starting from the feed's root element.
 *
 * Passes $this->root together with both arguments to
 * ActivityUtils::getLink() and returns its result untouched.
 *
 * @param string $rel  link relation to look for
 * @param mixed  $type optional type, handed through unchanged (default null)
 * @return mixed result of ActivityUtils::getLink()
 */
public function getAtomLink($rel, $type=null)
{
    $link = ActivityUtils::getLink($this->root, $rel, $type);

    return $link;
}
/**
 * Get the referenced PuSH hub link from an Atom feed.
 *
 * Equivalent to asking getAtomLink() for the 'hub' relation.
 *
 * @return mixed string or false
 */
public function getHubLink()
{
    $hub = $this->getAtomLink('hub');

    return $hub;
}
/**
 * Fetch a URL and load the feed found there, optionally following an
 * HTML page to the feed it advertises.
 *
 * @param string $url
 * @param bool $htmlOk pass false here if you don't want to follow web pages.
 * @return string with validated URL
 * @throws FeedSubBadURLException
 * @throws FeedSubBadHtmlException
 * @throws FeedSubNoFeedException
 * @throws FeedSubEmptyException
 * @throws FeedSubUnrecognizedTypeException
 */
function discoverFromURL($url, $htmlOk=true)
{
    try {
        $http = new HTTPClient();
        $response = $http->get($url);
    } catch (HTTP_Request2_Exception $failure) {
        // Log the transport failure, then surface it as a FeedSub error.
        common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $failure->getMessage());
        throw new FeedSubBadURLException($failure->getMessage());
    }

    // Only inspect the Content-Type when following web pages is allowed.
    $isHtml = false;
    if ($htmlOk) {
        $contentType = $response->getHeader('Content-Type');
        $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $contentType);
    }

    if (!$isHtml) {
        // Not (or not allowed to be) a web page: treat the response as the feed.
        return $this->initFromResponse($response);
    }

    $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
    if (!$target) {
        throw new FeedSubNoFeedException($url);
    }

    // Fetch the advertised feed, without following HTML a second time.
    return $this->discoverFromURL($target, false);
}
/**
 * Fetch a URL that is expected to point at a feed directly.
 *
 * Shorthand for discoverFromURL() with $htmlOk set to false.
 *
 * @param string $url
 * @return mixed result of discoverFromURL()
 */
function discoverFromFeedURL($url)
{
    $htmlOk = false;

    return $this->discoverFromURL($url, $htmlOk);
}
/**
 * Check an HTTP response and hand its body to init().
 *
 * @param object $response response object; isOk(), getStatus(), getUrl(),
 *                         getBody() and getHeader() are called on it
 * @return mixed result of init()
 * @throws FeedSubBadResponseException when the response is not OK
 * @throws FeedSubEmptyException when the response has no body
 * @throws FeedSubUnrecognizedTypeException when the Content-Type is not
 *         an accepted XML, RSS or Atom type
 */
function initFromResponse($response)
{
    if (!$response->isOk()) {
        throw new FeedSubBadResponseException($response->getStatus());
    }

    $sourceurl = $response->getUrl();
    $body = $response->getBody();
    if (!$body) {
        throw new FeedSubEmptyException($sourceurl);
    }

    $type = $response->getHeader('Content-Type');
    $accepted = '!^(text/xml|application/xml|application/(rss|atom)\+xml)!i';
    if (!preg_match($accepted, $type)) {
        common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
        throw new FeedSubUnrecognizedTypeException($type);
    }

    return $this->init($sourceurl, $type, $body);
}
/**
 * Parse a feed body and, if it is a usable RSS or Atom document, record
 * it on this object.
 *
 * On success $this->uri, $this->type, $this->feed (the DOMDocument) and
 * $this->root (the RSS <channel> or the Atom <feed> element) are set.
 * Nothing is stored when the document is rejected, so a failed call
 * leaves any previously loaded feed intact.
 *
 * @param string $sourceurl URL the body was fetched from
 * @param string $type      content type to record for the feed
 * @param string $body      XML source text
 * @return string the stored feed URL
 * @throws FeedSubBadXmlException when the body is empty, is not
 *         well-formed XML, or has no RSS channel / Atom feed root
 */
function init($sourceurl, $type, $body)
{
    // Recent PHP versions raise a ValueError for empty input to loadXML();
    // report it as the documented exception instead.
    if ((string) $body === '') {
        throw new FeedSubBadXmlException($sourceurl);
    }

    // Keep parser diagnostics for malformed feeds out of the output, and
    // forbid network access while parsing remotely supplied XML.
    $previous = libxml_use_internal_errors(true);
    $feed = new DOMDocument();
    $loaded = $feed->loadXML($body, LIBXML_NONET);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded || !$feed->documentElement) {
        throw new FeedSubBadXmlException($sourceurl);
    }

    $el = $feed->documentElement;

    // Looking for the "root" element: RSS channel or Atom feed
    if ($el->tagName === 'rss') {
        $channels = $el->getElementsByTagName('channel');
        if ($channels->length === 0) {
            throw new FeedSubBadXmlException($sourceurl);
        }
        $root = $channels->item(0);
    } elseif ($el->tagName === 'feed') {
        $root = $el;
    } else {
        throw new FeedSubBadXmlException($sourceurl);
    }

    // Everything validated: only now touch the object's state.
    $this->uri = $sourceurl;
    $this->type = $type;
    $this->feed = $feed;
    $this->root = $root;

    return $this->uri;
}
/**
 * Find the feed advertised by an HTML page through
 * <link rel="alternate" type="..." href="..."> autodiscovery.
 *
 * Atom is preferred over RSS; within each type the first matching link
 * wins. Relative links are resolved against <base href> when the page
 * has one, otherwise against the page URL.
 *
 * @param string $url source URL, used to resolve relative links
 * @param string $body HTML body text
 * @return mixed string with URL or false if no target found
 * @throws FeedSubBadHTMLException when the body is empty or cannot be parsed
 */
function discoverFromHTML($url, $body)
{
    // Recent PHP versions raise a ValueError for empty input to loadHTML();
    // report it as the documented exception instead.
    if ((string) $body === '') {
        throw new FeedSubBadHTMLException();
    }

    // DOMDocument::loadHTML may throw warnings on unrecognized elements,
    // and notices on unrecognized namespaces. Collect them inside libxml
    // rather than lowering the global error_reporting level.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $ok = $dom->loadHTML($body);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$ok) {
        throw new FeedSubBadHTMLException();
    }

    // Autodiscovery links may be relative to the page's URL or <base href>
    $base = false;
    $nodes = $dom->getElementsByTagName('base');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if ($node->hasAttributes()) {
            $href = $node->attributes->getNamedItem('href');
            if ($href) {
                $base = trim($href->value);
            }
        }
    }
    if ($base) {
        $base = $this->resolveURI($base, $url);
    } else {
        $base = $url;
    }

    // Ok... now on to the links!
    // Types listed in order of priority -- we'll prefer Atom if available.
    // @fixme merge with the munger link checks
    $feeds = array(
        'application/atom+xml' => false,
        'application/rss+xml' => false,
    );

    $nodes = $dom->getElementsByTagName('link');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if (!$node->hasAttributes()) {
            continue;
        }
        $rel = $node->attributes->getNamedItem('rel');
        $type = $node->attributes->getNamedItem('type');
        $href = $node->attributes->getNamedItem('href');
        if (!($rel && $type && $href)) {
            continue;
        }

        // rel is a whitespace-separated token list and, like the MIME
        // type, is compared without regard to case.
        $relTokens = preg_split('/\s+/', strtolower(trim($rel->value)), -1, PREG_SPLIT_NO_EMPTY);
        $mime = strtolower(trim($type->value));
        $target = trim($href->value);

        if (in_array('alternate', $relTokens, true) && array_key_exists($mime, $feeds) && empty($feeds[$mime])) {
            // Save the first feed found of each type...
            $feeds[$mime] = $this->resolveURI($target, $base);
        }
    }

    // Return the highest-priority feed found
    foreach ($feeds as $feedUrl) {
        if ($feedUrl) {
            return $feedUrl;
        }
    }

    return false;
}
267 * Resolve a possibly relative URL against some absolute base URL
268 * @param string $rel relative or absolute URL
269 * @param string $base absolute URL
270 * @return string absolute URL, or original URL if could not be resolved.
272 function resolveURI($rel, $base)
274 require_once "Net/URL2.php";
276 $relUrl = new Net_URL2($rel);
277 if ($relUrl->isAbsolute()) {
280 $baseUrl = new Net_URL2($base);
281 $absUrl = $baseUrl->resolve($relUrl);
282 return $absUrl->getURL();
283 } catch (Exception $e) {
284 common_log(LOG_WARNING, 'Unable to resolve relative link "' .
285 $rel . '" against base "' . $base . '": ' . $e->getMessage());