<?php
/*
 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2009, StatusNet, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
/**
 * @package FeedSubPlugin
 * @maintainer Brion Vibber <brion@status.net>
 */
// Stop with exit status 1 unless either the STATUSNET or the LACONICA
// constant has been defined before this file is loaded.
if (!(defined('STATUSNET') || defined('LACONICA'))) {
    exit(1);
}
/**
 * Thrown by discoverFromURL() when the HTTP client raises an
 * HTTP_Request2_Exception; carries that exception's message.
 */
class FeedSubBadURLException extends FeedSubException
{
}

/**
 * Thrown by initFromResponse() when the response's isOk() check fails;
 * constructed with the response status.
 */
class FeedSubBadResponseException extends FeedSubException
{
}

/**
 * Thrown by initFromResponse() with the source URL.
 * NOTE(review): presumably raised when the response body is empty -- confirm.
 */
class FeedSubEmptyException extends FeedSubException
{
}

/**
 * Thrown by discoverFromHTML().
 * NOTE(review): presumably raised when the page cannot be parsed as HTML -- confirm.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}

/**
 * Thrown by initFromResponse() when the Content-Type is not one of the
 * recognized XML feed types; constructed with the offending type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}

/**
 * Thrown by discoverFromURL() with the page URL when an HTML page was
 * fetched but no feed target was discovered in it.
 */
class FeedSubNoFeedException extends FeedSubException
{
}

/**
 * Thrown by init() with the source URL when the document cannot be used
 * as a feed (XML load failure, or no RSS channel / Atom feed root found).
 */
class FeedSubBadXmlException extends FeedSubException
{
}

/**
 * NOTE(review): not thrown anywhere in this file; presumably used by callers
 * when a discovered feed advertises no hub -- confirm against usage.
 */
class FeedSubNoHubException extends FeedSubException
{
}
/**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *     processFeed($feed->feed); // DOMDocument
 *   }
 */
/**
 * Post-initialize query helper: look up a link on the discovered feed.
 *
 * Only Atom-style links are consulted at present; both arguments are
 * passed through unchanged to getAtomLink().
 *
 * @param string $rel  link relation to look for
 * @param string $type optional type to match
 * @return mixed whatever getAtomLink() returns
 */
public function getLink($rel, $type=null)
{
    // @fixme check for non-Atom links in RSS2 feeds as well
    // Dispatch through $this rather than self:: so that a subclass
    // overriding getAtomLink() is honoured; self:: always binds to the
    // implementation in this class.
    return $this->getAtomLink($rel, $type);
}
/**
 * Look up a link on the feed's root element by delegating to
 * ActivityUtils::getLink().
 *
 * @param string $rel  link relation to look for
 * @param string $type optional type to match
 * @return mixed result of ActivityUtils::getLink()
 */
public function getAtomLink($rel, $type=null)
{
    $feedRoot = $this->root;

    return ActivityUtils::getLink($feedRoot, $rel, $type);
}
/**
 * Fetch a URL and initialize this object from the feed found there.
 *
 * When $htmlOk is true and the response is an HTML page, the page is
 * scanned for a feed autodiscovery link and that target is fetched
 * instead, this time with HTML following disabled.
 *
 * @param string $url
 * @param bool $htmlOk pass false here if you don't want to follow web pages.
 * @return string with validated URL
 * @throws FeedSubBadURLException
 * @throws FeedSubBadResponseException
 * @throws FeedSubBadHTMLException
 * @throws FeedSubNoFeedException
 * @throws FeedSubEmptyException
 * @throws FeedSubUnrecognizedTypeException
 * @throws FeedSubBadXmlException
 */
public function discoverFromURL($url, $htmlOk=true)
{
    try {
        $client = new HTTPClient();
        $response = $client->get($url);
    } catch (HTTP_Request2_Exception $e) {
        common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
        throw new FeedSubBadURLException($e->getMessage());
    }

    if ($htmlOk) {
        // A response without a Content-Type header yields null here;
        // cast so preg_match() always receives a string subject.
        $type = (string) $response->getHeader('Content-Type');
        $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
        if ($isHtml) {
            $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
            if (!$target) {
                throw new FeedSubNoFeedException($url);
            }
            // Fetch the discovered feed, but don't follow HTML again.
            return $this->discoverFromURL($target, false);
        }
    }

    return $this->initFromResponse($response);
}
/**
 * Fetch a URL that is expected to be a feed already; HTML pages are not
 * followed. Shorthand for discoverFromURL($url, false).
 *
 * @param string $url
 * @return mixed result of discoverFromURL()
 */
function discoverFromFeedURL($url)
{
    $followHtml = false;

    return $this->discoverFromURL($url, $followHtml);
}
/**
 * Validate an HTTP response and initialize this object from its body.
 *
 * @param object $response HTTP response offering isOk(), getStatus(),
 *                         getUrl(), getBody() and getHeader()
 * @return mixed result of init()
 * @throws FeedSubBadResponseException if the response is not OK
 * @throws FeedSubEmptyException if the response has no body
 * @throws FeedSubUnrecognizedTypeException if the Content-Type is not a
 *         recognized XML feed type
 * @throws FeedSubBadXmlException propagated from init()
 */
public function initFromResponse($response)
{
    if (!$response->isOk()) {
        throw new FeedSubBadResponseException($response->getStatus());
    }

    $sourceurl = $response->getUrl();
    $body = $response->getBody();
    if (!$body) {
        throw new FeedSubEmptyException($sourceurl);
    }

    $type = $response->getHeader('Content-Type');
    // The header may be absent (null); cast only the regex subject so the
    // values logged and passed on below stay exactly what the response gave.
    if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', (string) $type)) {
        return $this->init($sourceurl, $type, $body);
    }

    common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
    throw new FeedSubUnrecognizedTypeException($type);
}
/**
 * Parse a feed body and record it on this object.
 *
 * Sets uri, type and feed once the XML loads, then points root at the
 * first RSS <channel> or at the Atom <feed> document element.
 *
 * @param string $sourceurl URL the body was fetched from
 * @param string $type      content type reported for the body
 * @param string $body      XML text
 * @return string the stored source URL
 * @throws FeedSubBadXmlException if the XML cannot be loaded or no
 *         suitable root element is found
 */
function init($sourceurl, $type, $body)
{
    $doc = new DOMDocument();
    if (!$doc->loadXML($body)) {
        throw new FeedSubBadXmlException($sourceurl);
    }

    // These are recorded before the root check, so they stay set even
    // when the root lookup below ends in an exception.
    $this->uri = $sourceurl;
    $this->type = $type;
    $this->feed = $doc;

    // Looking for the "root" element: RSS channel or Atom feed
    $top = $this->feed->documentElement;
    switch ($top->tagName) {
        case 'rss':
            $channelList = $top->getElementsByTagName('channel');
            if (!($channelList->length > 0)) {
                throw new FeedSubBadXmlException($sourceurl);
            }
            $this->root = $channelList->item(0);
            break;

        case 'feed':
            $this->root = $top;
            break;

        default:
            throw new FeedSubBadXmlException($sourceurl);
    }

    return $this->uri;
}
/**
 * Scan an HTML page for feed autodiscovery <link> elements.
 *
 * @param string $url source URL, used to resolve relative links
 * @param string $body HTML body text
 * @return mixed string with URL or false if no target found
 * @throws FeedSubBadHTMLException if the page cannot be parsed
 */
public function discoverFromHTML($url, $body)
{
    // DOMDocument::loadHTML() refuses empty input with a ValueError on
    // PHP 8; report it as an unparseable page instead.
    if ((string) $body === '') {
        throw new FeedSubBadHTMLException();
    }

    // DOMDocument::loadHTML may throw warnings on unrecognized elements.
    // Collect them inside libxml instead of changing the global
    // error_reporting level, then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $ok = $dom->loadHTML($body);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$ok) {
        throw new FeedSubBadHTMLException();
    }

    // Autodiscovery links may be relative to the page's URL or <base href>
    $base = false;
    foreach ($dom->getElementsByTagName('base') as $node) {
        if ($node->hasAttribute('href')) {
            // Only the first <base> carrying an href counts, as in browsers.
            $base = trim($node->getAttribute('href'));
            break;
        }
    }
    if ($base) {
        $base = $this->resolveURI($base, $url);
    } else {
        $base = $url;
    }

    // Ok... now on to the links!
    // Types listed in order of priority -- we'll prefer Atom if available.
    // @fixme merge with the munger link checks
    $feeds = array(
        'application/atom+xml' => false,
        'application/rss+xml' => false,
    );

    foreach ($dom->getElementsByTagName('link') as $node) {
        if (!$node->hasAttribute('rel')
            || !$node->hasAttribute('type')
            || !$node->hasAttribute('href')) {
            continue;
        }

        // rel is a whitespace-separated, case-insensitive token list and
        // media types are case-insensitive too, so normalize before matching.
        $rel = preg_split('/\s+/', strtolower($node->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        $type = strtolower(trim($node->getAttribute('type')));
        $href = trim($node->getAttribute('href'));

        if (in_array('alternate', $rel, true) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
            // Save the first feed found of each type...
            $feeds[$type] = $this->resolveURI($href, $base);
        }
    }

    // Return the highest-priority feed found
    foreach ($feeds as $feedUrl) {
        if ($feedUrl) {
            return $feedUrl;
        }
    }

    return false;
}
/**
 * Turn a link that may be relative into an absolute URL.
 *
 * @param string $rel relative or absolute URL
 * @param string $base absolute URL
 * @return string absolute URL, or original URL if could not be resolved.
 */
function resolveURI($rel, $base)
{
    require_once "Net/URL2.php";

    try {
        $candidate = new Net_URL2($rel);
        if (!$candidate->isAbsolute()) {
            $anchor = new Net_URL2($base);
            $resolved = $anchor->resolve($candidate);
            return $resolved->getURL();
        }
        // Already absolute: fall through and hand back the input as-is.
    } catch (Exception $e) {
        $problem = 'Unable to resolve relative link "' .
            $rel . '" against base "' . $base . '": ' . $e->getMessage();
        common_log(LOG_WARNING, $problem);
    }

    return $rel;
}