3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2009, StatusNet, Inc.
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * @package FeedSubPlugin
22 * @maintainer Brion Vibber <brion@status.net>
// Stop immediately with exit status 1 unless the including application has
// defined either the STATUSNET or the LACONICA constant.
if (!(defined('STATUSNET') || defined('LACONICA'))) {
    exit(1);
}
/**
 * Thrown by discoverFromURL() when the HTTP request for the given URL fails
 * (an HTTP_Request2_Exception was caught).
 */
class FeedSubBadURLException extends FeedSubException
{
}
/**
 * Thrown by initFromResponse() when the HTTP response is not OK; it is
 * constructed with the response code.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
/**
 * Thrown by initFromResponse() when the fetched response has no body; it is
 * constructed with the source URL.
 */
class FeedSubEmptyException extends FeedSubException
{
}
/**
 * Thrown by discoverFromHTML() when the page body cannot be loaded as HTML.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
/**
 * Thrown by initFromResponse() when the response's Content-Type is not one
 * of the accepted XML/RSS/Atom types; it is constructed with that type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
/**
 * Thrown by discoverFromURL() when an HTML page was fetched but no feed link
 * could be discovered in it; it is constructed with the page URL.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
/**
 * Thrown by init() when the feed body cannot be parsed as XML.
 */
class FeedSubBadXmlException extends FeedSubException
{
}
/**
 * Marker exception; not thrown by any code visible in this file.
 * NOTE(review): presumably signals a feed that advertises no hub link --
 * confirm against the callers that throw it.
 */
class FeedSubNoHubException extends FeedSubException
{
}
60 * Given a web page or feed URL, discover the final location of the feed
61 * and return its current contents.
64 * $feed = new FeedDiscovery();
65 * if ($feed->discoverFromURL($url)) {
68 * processFeed($feed->feed); // DOMDocument
/**
 * Post-initialize query helper: look up a link with the given relation
 * (and optionally type) in the loaded feed.
 *
 * Currently this only delegates to getAtomLink().
 *
 * @param string $rel  link relation to look for
 * @param string $type optional link type to match; defaults to null
 * @return mixed whatever getAtomLink() returns for this relation/type
 */
public function getLink($rel, $type=null)
{
    // @fixme check for non-Atom links in RSS2 feeds as well
    // getAtomLink() is an instance method that reads $this->feed, so call
    // it on the instance (this also honours overrides in subclasses).
    return $this->getAtomLink($rel, $type);
}
/**
 * Atom link lookup on the loaded feed document.
 *
 * @param string $rel  link relation, handed to ActivityUtils::getLink()
 * @param string $type optional link type, handed through unchanged
 * @return mixed result of ActivityUtils::getLink() for the feed's root element
 */
public function getAtomLink($rel, $type=null)
{
    $root = $this->feed->documentElement;

    return ActivityUtils::getLink($root, $rel, $type);
}
/**
 * Fetch a URL and load the feed found there. When HTML is allowed and the
 * response is an HTML page, the page's autodiscovery link is followed once
 * (the second fetch is made with $htmlOk = false).
 *
 * @param string $url
 * @param bool $htmlOk pass false here if you don't want to follow web pages.
 * @return string with validated URL
 * @throws FeedSubBadURLException
 * @throws FeedSubBadResponseException
 * @throws FeedSubBadHTMLException
 * @throws FeedSubNoFeedException
 * @throws FeedSubEmptyException
 * @throws FeedSubUnrecognizedTypeException
 * @throws FeedSubBadXmlException
 */
function discoverFromURL($url, $htmlOk=true)
{
    try {
        $http = new HTTPClient();
        $response = $http->get($url);
    } catch (HTTP_Request2_Exception $failure) {
        common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $failure->getMessage());
        throw new FeedSubBadURLException($failure);
    }

    // Only inspect the content type when the caller lets us follow web pages.
    if ($htmlOk) {
        $contentType = $response->getHeader('Content-Type');
        if (preg_match('!^(text/html|application/xhtml\+xml)!i', $contentType)) {
            $feedUrl = $this->discoverFromHTML($response->getUrl(), $response->getBody());
            if ($feedUrl) {
                // The discovered target must be a feed itself: no more HTML hops.
                return $this->discoverFromURL($feedUrl, false);
            }
            throw new FeedSubNoFeedException($url);
        }
    }

    // Not following HTML, or not an HTML page: treat the response as the feed.
    return $this->initFromResponse($response);
}
/**
 * Load a feed from a URL that must itself be a feed: identical to
 * discoverFromURL() with HTML page following switched off.
 *
 * @param string $url
 * @return string whatever discoverFromURL() returns for this URL
 */
function discoverFromFeedURL($url)
{
    $followWebPages = false;

    return $this->discoverFromURL($url, $followWebPages);
}
/**
 * Validate a fetched HTTP response and, if it looks like a feed, load it
 * through init().
 *
 * @param object $response HTTP response; isOk(), getCode(), getUrl(),
 *                         getBody() and getHeader() are called on it
 * @return string whatever init() returns
 * @throws FeedSubBadResponseException if the response is not OK
 * @throws FeedSubEmptyException if the response body is empty
 * @throws FeedSubUnrecognizedTypeException if the Content-Type is not an
 *         accepted XML/RSS/Atom type
 */
function initFromResponse($response)
{
    if (!$response->isOk()) {
        throw new FeedSubBadResponseException($response->getCode());
    }

    $sourceurl = $response->getUrl();
    $content = $response->getBody();
    if (!$content) {
        throw new FeedSubEmptyException($sourceurl);
    }

    $type = $response->getHeader('Content-Type');
    $looksLikeFeed = preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type);
    if (!$looksLikeFeed) {
        common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
        throw new FeedSubUnrecognizedTypeException($type);
    }

    return $this->init($sourceurl, $type, $content);
}
/**
 * Parse a feed body as XML and store the result on this object
 * ($this->uri, $this->type, $this->feed).
 *
 * @param string $sourceurl URL the feed was fetched from
 * @param string $type      content type reported for the feed
 * @param string $body      feed XML text
 * @return string the source URL, as stored in $this->uri
 * @throws FeedSubBadXmlException if the body is empty or is not parseable XML
 */
function init($sourceurl, $type, $body)
{
    $feed = new DOMDocument();

    // Newer PHP versions raise an error on empty input instead of returning
    // false, so treat an empty body as unparseable without calling loadXML().
    if ((string)$body !== '' && $feed->loadXML($body)) {
        $this->uri = $sourceurl;
        $this->type = $type;
        $this->feed = $feed;
        return $this->uri;
    } else {
        // Identify the failing feed by the URL it was fetched from.
        throw new FeedSubBadXmlException($sourceurl);
    }
}
/**
 * Find the preferred feed advertised by an HTML page's autodiscovery links.
 *
 * Looks at <link> elements whose rel contains the "alternate" keyword and
 * whose type is Atom or RSS. Relative hrefs are resolved against the page's
 * <base href> when one is present, otherwise against the page URL. Atom is
 * preferred over RSS; within a type the first link found wins.
 *
 * @param string $url source URL, used to resolve relative links
 * @param string $body HTML body text
 * @return mixed string with URL or false if no target found
 * @throws FeedSubBadHTMLException if the body is empty or cannot be parsed
 */
function discoverFromHTML($url, $body)
{
    // Reject empty input up front: newer PHP versions raise an error for an
    // empty source, which would also skip restoring error_reporting below.
    if ((string)$body === '') {
        throw new FeedSubBadHTMLException();
    }

    // DOMDocument::loadHTML may throw warnings on unrecognized elements.
    $old = error_reporting(error_reporting() & ~E_WARNING);
    $dom = new DOMDocument();
    $ok = $dom->loadHTML($body);
    error_reporting($old);

    if (!$ok) {
        throw new FeedSubBadHTMLException();
    }

    // Autodiscovery links may be relative to the page's URL or <base href>
    $base = false;
    $nodes = $dom->getElementsByTagName('base');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if ($node->hasAttributes()) {
            $href = $node->attributes->getNamedItem('href');
            if ($href) {
                $base = trim($href->value);
            }
        }
    }
    if ($base) {
        // The <base href> may itself be relative to the page URL.
        $base = $this->resolveURI($base, $url);
    } else {
        $base = $url;
    }

    // Ok... now on to the links!
    // Types listed in order of priority -- we'll prefer Atom if available.
    // @fixme merge with the munger link checks
    $feeds = array(
        'application/atom+xml' => false,
        'application/rss+xml' => false,
    );

    $nodes = $dom->getElementsByTagName('link');
    for ($i = 0; $i < $nodes->length; $i++) {
        $node = $nodes->item($i);
        if ($node->hasAttributes()) {
            $rel = $node->attributes->getNamedItem('rel');
            $type = $node->attributes->getNamedItem('type');
            $href = $node->attributes->getNamedItem('href');
            if ($rel && $type && $href) {
                // rel is a space-separated, case-insensitive keyword list
                // (e.g. "alternate feed"); MIME types are case-insensitive.
                $rels = preg_split('/\s+/', strtolower($rel->value), -1, PREG_SPLIT_NO_EMPTY);
                $type = strtolower(trim($type->value));
                $href = trim($href->value);

                if (in_array('alternate', $rels, true) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                    // Save the first feed found of each type...
                    $feeds[$type] = $this->resolveURI($href, $base);
                }
            }
        }
    }

    // Return the highest-priority feed found
    foreach ($feeds as $feedUrl) {
        if ($feedUrl) {
            return $feedUrl;
        }
    }

    return false;
}
/**
 * Resolve a possibly relative URL against some absolute base URL.
 *
 * Resolution is best-effort: if Net_URL2 throws, the problem is logged as a
 * warning and the input is handed back unchanged.
 *
 * @param string $rel relative or absolute URL
 * @param string $base absolute URL
 * @return string absolute URL, or original URL if could not be resolved.
 */
function resolveURI($rel, $base)
{
    require_once "Net/URL2.php";

    try {
        $target = new Net_URL2($rel);
        if (!$target->isAbsolute()) {
            $context = new Net_URL2($base);
            return $context->resolve($target)->getURL();
        }
        // Already absolute: nothing to resolve.
        return $rel;
    } catch (Exception $problem) {
        common_log(LOG_WARNING, 'Unable to resolve relative link "' .
            $rel . '" against base "' . $base . '": ' . $problem->getMessage());
        return $rel;
    }
}