3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2009, StatusNet, Inc.
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * @package FeedSubPlugin
22 * @maintainer Brion Vibber <brion@status.net>
// Entry guard: terminate with exit status 1 unless either the STATUSNET or
// the LACONICA constant has already been defined.
// NOTE(review): presumably those constants are set by the application
// bootstrap, so this blocks running the file on its own -- confirm.
25 if (!defined('STATUSNET') && !defined('LACONICA')) { exit(1); }
// Thrown by discoverFromURL() when the HTTP client request raises an
// HTTP_Request2_Exception; the caught exception is passed to the constructor.
27 class FeedSubBadURLException extends FeedSubException
// Thrown by initFromResponse() when $response->isOk() is false; constructed
// with the value of $response->getCode().
31 class FeedSubBadResponseException extends FeedSubException
// Thrown by initFromResponse(), constructed with the response URL.
// NOTE(review): the guarding condition is not visible in this excerpt;
// presumably it fires when the response body is empty -- confirm.
35 class FeedSubEmptyException extends FeedSubException
// Thrown by discoverFromHTML(), without arguments. The throw site spells the
// name "FeedSubBadHtmlException"; PHP resolves class names
// case-insensitively, so both spellings refer to this class.
// NOTE(review): the guarding condition is not visible in this excerpt;
// presumably it fires when DOMDocument::loadHTML() fails -- confirm.
39 class FeedSubBadHTMLException extends FeedSubException
// Thrown by initFromResponse() right after it logs an "Unrecognized feed
// type" warning; constructed with the response's Content-Type header value.
43 class FeedSubUnrecognizedTypeException extends FeedSubException
// Thrown by discoverFromURL(), constructed with the URL that was requested.
// NOTE(review): the guarding condition is not visible in this excerpt;
// presumably it fires when discoverFromHTML() finds no feed link -- confirm.
47 class FeedSubNoFeedException extends FeedSubException
52 * Given a web page or feed URL, discover the final location of the feed
53 * and load its current contents into the object's body property.
56 * $feed = new FeedDiscovery();
57 * if ($feed->discoverFromURL($url)) {
60 * processFeed($feed->body);
// Build a FeedMunger for the feed data currently held by this object.
// Parses $this->body with XML_Feed_Parser and hands the parsed feed,
// together with $this->uri, to the FeedMunger constructor.
// @return FeedMunger
70 public function feedMunger()
// The parser library is loaded here, at the point of use.
72 require_once 'XML/Feed/Parser.php';
// NOTE(review): the three positional boolean flags are not explained in this
// file; check them against the XML_Feed_Parser constructor signature (the
// existing @fixme suggests they were already considered provisional).
73 $feed = new XML_Feed_Parser($this->body, false, false, true); // @fixme
74 return new FeedMunger($feed, $this->uri);
79 * @param bool $htmlOk pass false here if you don't want to follow web pages.
80 * @return string with validated URL
81 * @throws FeedSubBadURLException
82 * @throws FeedSubBadHTMLException
83 * @throws FeedSubNoFeedException
84 * @throws FeedSubEmptyException
85 * @throws FeedSubUnrecognizedTypeException
// Fetch $url over HTTP and initialise this object from the response.
// If the response is an HTML/XHTML page, a feed link is looked up in the
// page via discoverFromHTML() and the function calls itself on that target.
// The final result is whatever initFromResponse() returns, so that method's
// exceptions (including FeedSubBadResponseException) can propagate from here.
// NOTE(review): several structural lines (the try opener and the if-guards
// around the HTML handling) are not visible in this excerpt; the notes on
// branching below rely on the docblock and should be confirmed.
87 function discoverFromURL($url, $htmlOk=true)
// Request the URL; an HTTP_Request2_Exception from the client is rethrown as
// FeedSubBadURLException with the original exception as constructor argument.
90 $client = new HTTPClient();
91 $response = $client->get($url);
92 } catch (HTTP_Request2_Exception $e) {
93 throw new FeedSubBadURLException($e);
// Treat the response as a web page when its Content-Type begins with
// text/html or application/xhtml+xml (case-insensitive match).
97 $type = $response->getHeader('Content-Type');
98 $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
// Search the page for a feed link, using the URL reported by the response
// as the page URL for resolving relative links.
100 $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
// NOTE(review): presumably reached only when no target was found -- confirm.
102 throw new FeedSubNoFeedException($url);
// Recurse on the discovered target with $htmlOk=false, which per the
// docblock means web pages are not followed on that second request.
104 return $this->discoverFromURL($target, false);
// Hand the response to initFromResponse() for validation.
108 return $this->initFromResponse($response);
// Validate an HTTP response and record the feed's details on this object.
// @param mixed $response response object; this method calls isOk(),
//        getCode(), getUrl(), getBody() and getHeader() on it
// @throws FeedSubBadResponseException
// @throws FeedSubEmptyException
// @throws FeedSubUnrecognizedTypeException
// NOTE(review): parts of this function (the empty-body guard, the tail of the
// accepted-type branch and the return value) are not visible in this excerpt.
111 function initFromResponse($response)
// Reject responses that are not OK, reporting the response code.
113 if (!$response->isOk()) {
114 throw new FeedSubBadResponseException($response->getCode());
117 $sourceurl = $response->getUrl();
118 $body = $response->getBody();
// NOTE(review): the condition guarding this throw is not visible here;
// presumably it tests for an empty $body -- confirm.
120 throw new FeedSubEmptyException($sourceurl);
// Accept Content-Type values beginning with text/xml, application/xml,
// application/rss+xml or application/atom+xml (case-insensitive match).
123 $type = $response->getHeader('Content-Type');
124 if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
// Remember the URL reported by the response as the feed's URI.
125 $this->uri = $sourceurl;
// Log a warning naming the type and URL, then fail with the type as the
// exception argument.
130 common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
131 throw new FeedSubUnrecognizedTypeException($type);
136 * @param string $url source URL, used to resolve relative links
137 * @param string $body HTML body text
138 * @return mixed string with URL or false if no target found
// Look for a feed autodiscovery link in an HTML document.
// Parses $body with DOMDocument, derives a base URL from any <base href>,
// then scans <link> elements for rel="alternate" entries whose type is one
// of the known feed types, keeping the first link found for each type.
// NOTE(review): many structural lines (guards, the initial $base value, the
// $feeds array opener and the body of the final loop) are not visible in
// this excerpt; the return behaviour is described only by the docblock.
140 function discoverFromHTML($url, $body)
142 // DOMDocument::loadHTML may throw warnings on unrecognized elements.
// Mask E_WARNING for the duration of the parse; the previous reporting
// level is kept in $old and restored right after loadHTML().
143 $old = error_reporting(error_reporting() & ~E_WARNING);
144 $dom = new DOMDocument();
145 $ok = $dom->loadHTML($body);
146 error_reporting($old);
// NOTE(review): the guard for this throw is not visible; presumably it
// tests $ok -- confirm. The class is declared as FeedSubBadHTMLException;
// PHP class-name lookup is case-insensitive, so this spelling resolves.
149 throw new FeedSubBadHtmlException();
152 // Autodiscovery links may be relative to the page's URL or <base href>
// Walk every <base> element and pick up the trimmed href value.
154 $nodes = $dom->getElementsByTagName('base');
155 for ($i = 0; $i < $nodes->length; $i++) {
156 $node = $nodes->item($i);
157 if ($node->hasAttributes()) {
158 $href = $node->attributes->getNamedItem('href');
160 $base = trim($href->value);
// The <base href> may itself be relative, so resolve it against the page URL.
165 $base = $this->resolveURI($base, $url);
170 // Ok... now on to the links!
171 // Types listed in order of priority -- we'll prefer Atom if available.
172 // @fixme merge with the munger link checks
// Each candidate type starts out as false, meaning "no link found yet".
174 'application/atom+xml' => false,
175 'application/rss+xml' => false,
// Walk every <link> element; only those carrying rel, type and href
// attributes are considered.
178 $nodes = $dom->getElementsByTagName('link');
179 for ($i = 0; $i < $nodes->length; $i++) {
180 $node = $nodes->item($i);
181 if ($node->hasAttributes()) {
182 $rel = $node->attributes->getNamedItem('rel');
183 $type = $node->attributes->getNamedItem('type');
184 $href = $node->attributes->getNamedItem('href');
185 if ($rel && $type && $href) {
// Replace the attribute nodes with their trimmed string values.
186 $rel = trim($rel->value);
187 $type = trim($type->value);
188 $href = trim($href->value);
// The type must be an exact key of $feeds and must not have been filled
// yet. ($rel was already trimmed above, so the extra trim() is a no-op.)
190 if (trim($rel) == 'alternate' && array_key_exists($type, $feeds) && empty($feeds[$type])) {
191 // Save the first feed found of each type...
// Store the link as resolved against $base.
192 $feeds[$type] = $this->resolveURI($href, $base);
198 // Return the highest-priority feed found
// Iterates $feeds in its declaration order (Atom before RSS).
199 foreach ($feeds as $type => $url) {
209 * Resolve a possibly relative URL against some absolute base URL
210 * @param string $rel relative or absolute URL
211 * @param string $base absolute URL
212 * @return string absolute URL, or original URL if could not be resolved.
// Resolve $rel against $base using Net_URL2.
// NOTE(review): the try opener, the body of the "already absolute" branch and
// the end of the function are not visible in this excerpt; per the docblock
// the original URL is returned when resolution is not possible -- confirm.
214 function resolveURI($rel, $base)
// The URL library is loaded here, at the point of use.
216 require_once "Net/URL2.php";
218 $relUrl = new Net_URL2($rel);
// Absolute URLs are handled separately (branch body not visible here).
219 if ($relUrl->isAbsolute()) {
// Resolve the relative URL against the base and return it as a string.
222 $baseUrl = new Net_URL2($base);
223 $absUrl = $baseUrl->resolve($relUrl);
224 return $absUrl->getURL();
// Any exception is caught and logged as a warning that names both URLs and
// includes the exception message.
225 } catch (Exception $e) {
226 common_log(LOG_WARNING, 'Unable to resolve relative link "' .
227 $rel . '" against base "' . $base . '": ' . $e->getMessage());