9 * This source file is subject to the new BSD license that is bundled
10 * with this package in the file LICENSE.
11 * It is also available through the world-wide-web at this URL:
12 * http://phergie.org/license
15 * @package Phergie_Plugin_Url
16 * @author Phergie Development Team <team@phergie.org>
17 * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
18 * @license http://phergie.org/license New BSD License
19 * @link http://pear.phergie.org/package/Phergie_Plugin_Url
23 * Monitors incoming messages for instances of URLs and responds with messages
24 * containing relevant information about detected URLs.
26 * Has a utility method accessible via
27 * $this->getPlugin('Url')->getTitle('http://foo..').
30 * @package Phergie_Plugin_Url
31 * @author Phergie Development Team <team@phergie.org>
32 * @license http://phergie.org/license New BSD License
33 * @link http://pear.phergie.org/package/Phergie_Plugin_Url
34 * @uses Phergie_Plugin_Http pear.phergie.org
36 class Phergie_Plugin_Url extends Phergie_Plugin_Abstract
41 * Can use the variables %nick%, %title% and %link% in it to display
42 * page titles and links
46 protected $baseFormat = '%message%';
47 protected $messageFormat = '[ %link% ] %title%';
50 * Flag indicating whether a single response should be sent for a single
51 * message containing multiple links
55 protected $mergeLinks = true;
58 * Max length of the fetched URL title
62 protected $titleLength = 40;
65 * Url cache to prevent spamming, especially with multiple bots on the
70 protected $urlCache = array();
71 protected $shortCache = array();
74 * Time in seconds to store the cached entries
76 * Setting it to 0 or below disables the cache expiration
80 protected $expire = 1800;
83 * Number of entries to keep in the cache at one time per channel
85 * Setting it to 0 or below disables the cache limit
89 protected $limit = 10;
92 * Flag that determines if the plugin will fall back to using an HTTP
93 * stream when a URL using SSL is detected and OpenSSL support isn't
94 * available in the PHP installation in use
98 protected $sslFallback = true;
101 * Flag that is set to true by the custom error handler if an HTTP error
102 * code has been received
106 protected $errorStatus = false;
107 protected $errorMessage = null;
110 * Flag indicating whether or not to display error messages as the title
111 * if a link posted encounters an error
115 protected $showErrors = true;
118 * Flag indicating whether to detect schemeless URLs (e.g. "example.com")
122 protected $detectSchemeless = false;
125 * List of error messages to return when the requested URL returns an
130 protected $httpErrors = array(
131 100 => '100 Continue',
133 201 => '201 Created',
134 204 => '204 No Content',
135 206 => '206 Partial Content',
136 300 => '300 Multiple Choices',
137 301 => '301 Moved Permanently',
139 303 => '303 See Other',
140 304 => '304 Not Modified',
141 307 => '307 Temporary Redirect',
142 400 => '400 Bad Request',
143 401 => '401 Unauthorized',
144 403 => '403 Forbidden',
145 404 => '404 Not Found',
146 405 => '405 Method Not Allowed',
147 406 => '406 Not Acceptable',
148 408 => '408 Request Timeout',
150 413 => '413 Request Entity Too Large',
151 414 => '414 Request URI Too Long',
152 415 => '415 Unsupported Media Type',
153 416 => '416 Requested Range Not Satisfiable',
154 417 => '417 Expectation Failed',
155 500 => '500 Internal Server Error',
156 501 => '501 Method Not Implemented',
157 503 => '503 Service Unavailable',
158 506 => '506 Variant Also Negotiates'
162 * An array containing a list of TLDs used for non-scheme matches
166 protected $tldList = array();
171 protected $shortener;
176 protected $renderers = array();
/**
 * Initializes settings, checks dependencies.
 *
 * Instantiates the configured shortener, loads the TLD list used to
 * validate hosts and copies any "url.*" configuration values onto the
 * matching class properties.
 *
 * @return void
 */
public function onConnect()
{
    // make the shortener configurable
    $shortener = $this->getConfig('url.shortener', 'Trim');
    $shortener = "Phergie_Plugin_Url_Shorten_{$shortener}";
    $this->shortener = new $shortener($this->plugins->getPlugin('Http'));

    if (!$this->shortener instanceof Phergie_Plugin_Url_Shorten_Abstract) {
        $this->fail("Declared shortener class {$shortener} is not of proper ancestry");
    }

    // Get a list of valid TLDs
    if (!is_array($this->tldList) || count($this->tldList) <= 6) {
        $tldPath = dirname(__FILE__) . '/Url/url.tld.txt';
        $this->tldList = explode("\n", file_get_contents($tldPath));
        $this->debug('Loaded ' . count($this->tldList) . ' tlds');
        rsort($this->tldList);
    }

    // Map of "url.<key>" configuration settings to the property each one
    // overrides
    $settings = array(
        'detect_schemeless' => 'detectSchemeless',
        'base_format' => 'baseFormat',
        'message_format' => 'messageFormat',
        'merge_links' => 'mergeLinks',
        'title_length' => 'titleLength',
        'show_errors' => 'showErrors',
        'expire' => 'expire',
    );
    foreach ($settings as $config => $local) {
        // Check and read the SAME key; the value used to be read from
        // "uri.<key>", which is never set
        $key = "url.{$config}";
        if (isset($this->config[$key])) {
            $this->$local = $this->config[$key];
        }
    }
}
/**
 * Checks an incoming message for the presence of a URL and, if one is
 * found, responds with its title and the shortened equivalent of its
 * original URL.
 *
 * Every detected URL is validated (email/directory look-alikes, IP
 * address, TLD, scheme), offered to the registered renderers, shortened,
 * checked against the per-source cache and finally formatted using
 * $messageFormat. The collected responses are sent either as a single
 * message or as one message per link, depending on $mergeLinks.
 *
 * @todo Update this to pull configuration settings from $this->config
 *       rather than caching them as class properties
 *
 * @return void
 */
public function onPrivmsg()
{
    $source = $this->getEvent()->getSource();
    $user = $this->getEvent()->getNick();

    $pattern = '#'.($this->detectSchemeless ? '' : 'https?://').'(?:([0-9]{1,3}(?:\.[0-9]{1,3}){3})(?![^/]) | ('
        .($this->detectSchemeless ? '(?<!http:/|https:/)[@/\\\]' : '').')?(?:(?:[a-z0-9_-]+\.?)+\.[a-z0-9]{1,6}))[^\s]*#xis';

    // URL Match
    if (!preg_match_all($pattern, $this->getEvent()->getArgument(1), $matches, PREG_SET_ORDER)) {
        return;
    }

    $responses = array();
    foreach ($matches as $m) {
        $url = trim(rtrim($m[0], ', ].?!;'));

        // Check to see if the URL was from an email address, is a directory, etc
        // (group 2 captures a leading "@", "/" or "\" in schemeless mode)
        if (!empty($m[2])) {
            $this->debug('Invalid Url: URL is either an email or a directory path. (' . $url . ')');
            continue;
        }

        // Parse the given URL
        if (!$parsed = $this->parseUrl($url)) {
            $this->debug('Invalid Url: Could not parse the URL. (' . $url . ')');
            continue;
        }

        // allow out-of-class renderers to handle this URL
        foreach ($this->renderers as $renderer) {
            if ($renderer->renderUrl($parsed) === true) {
                // renderers should return true if they've fully
                // rendered the passed URL (they're responsible
                // for their own output)
                $this->debug('Handled by renderer: ' . get_class($renderer));
                continue 2;
            }
        }

        // Check to see if the given IP/Host is valid
        if (!empty($m[1]) and !$this->checkValidIP($m[1])) {
            $this->debug('Invalid Url: ' . $m[1] . ' is not a valid IP address. (' . $url . ')');
            continue;
        }

        // Process TLD if it's not an IP
        if (empty($m[1])) {
            // Get the TLD from the host
            $pos = strrpos($parsed['host'], '.');
            $parsed['tld'] = ($pos !== false ? substr($parsed['host'], ($pos+1)) : '');

            // Check to see if the URL has a valid TLD
            if (is_array($this->tldList) && !in_array(strtolower($parsed['tld']), $this->tldList)) {
                $this->debug('Invalid Url: ' . $parsed['tld'] . ' is not a supported TLD. (' . $url . ')');
                continue;
            }
        }

        // Check to see if the URL is to a secured site or not and handle it accordingly
        if ($parsed['scheme'] == 'https' && !extension_loaded('openssl')) {
            if (!$this->sslFallback) {
                $this->debug('Invalid Url: HTTPS is an invalid scheme, OpenSSL isn\'t available. (' . $url . ')');
                continue;
            }
            $parsed['scheme'] = 'http';
        }

        if (!in_array($parsed['scheme'], array('http', 'https'))) {
            $this->debug('Invalid Url: ' . $parsed['scheme'] . ' is not a supported scheme. (' . $url . ')');
            continue;
        }
        $url = $this->glueURL($parsed);
        unset($parsed);

        // Convert url
        $shortenedUrl = $this->shortener->shorten($url);
        if (!$shortenedUrl) {
            $this->debug('Invalid Url: Unable to shorten. (' . $url . ')');
            continue;
        }

        // Prevent spamfest
        if ($this->checkUrlCache($url, $shortenedUrl)) {
            $this->debug('Invalid Url: URL is in the cache. (' . $url . ')');
            continue;
        }

        // getTitle() is an instance method, so call it through $this
        $title = $this->getTitle($url);
        if (!empty($title)) {
            $responses[] = str_replace(
                array('%title%', '%link%', '%nick%'),
                array($title, $shortenedUrl, $user),
                $this->messageFormat
            );
        }

        // Update cache
        $this->updateUrlCache($url, $shortenedUrl);
        unset($title, $shortenedUrl);
    }

    if (count($responses) == 0) {
        return;
    }

    // Merged mode sends one message holding every response; otherwise each
    // response gets its own message. (Previously the unmerged branch sent
    // the full merged text once per response.)
    $bodies = $this->mergeLinks
        ? array(implode('; ', $responses))
        : $responses;

    foreach ($bodies as $body) {
        $message = str_replace(
            array('%message%', '%nick%'),
            array($body, $user),
            $this->baseFormat
        );
        $this->doPrivmsg($source, $message);
    }
}
/**
 * Checks a given URL (+shortened) against the cache to verify if they were
 * previously posted on the channel.
 *
 * @param string $url          The URL to check against
 * @param string $shortenedUrl The shortened URL to check against
 *
 * @return bool true if either form of the URL counts as cached
 */
protected function checkUrlCache($url, $shortenedUrl)
{
    $channel = $this->getEvent()->getSource();

    // Cache entries are keyed by the hex CRC32 checksum of the URL
    $urlKey = $this->getUrlChecksum($url);
    $shortKey = $this->getUrlChecksum($shortenedUrl);

    // Timestamp stored for each form, or null when it is not cached
    $urlSeen = null;
    if (isset($this->urlCache[$channel][$urlKey])) {
        $urlSeen = $this->urlCache[$channel][$urlKey];
    }
    $shortSeen = null;
    if (isset($this->shortCache[$channel][$shortKey])) {
        $shortSeen = $this->shortCache[$channel][$shortKey];
    }

    $ttl = $this->expire;
    $this->debug("Cache expire: {$ttl}");

    // Expiration enabled: an entry only counts while it is still fresh
    if ($ttl > 0) {
        return ($urlSeen + $ttl) > time() || ($shortSeen + $ttl) > time();
    }

    // Expiration disabled: any cached entry counts
    return $ttl <= 0 && (isset($urlSeen) || isset($shortSeen));
}
/**
 * Updates the cache and adds the given URL (+shortened) to the cache. It
 * also removes the oldest entry once a cache grows past $limit.
 *
 * @param string $url          The URL to add to the cache
 * @param string $shortenedUrl The shortened URL to add to the cache
 *
 * @return void
 */
protected function updateUrlCache($url, $shortenedUrl)
{
    $source = $this->getEvent()->getSource();

    // Cache entries are keyed by the hex CRC32 checksum of the URL
    $url = $this->getUrlChecksum($url);
    $shortenedUrl = $this->getUrlChecksum($shortenedUrl);
    $time = time();

    // Handle the URL cache and remove the oldest entry if the limit is surpassed
    $this->urlCache[$source][$url] = $time;
    if ($this->limit > 0 && count($this->urlCache[$source]) > $this->limit) {
        asort($this->urlCache[$source], SORT_NUMERIC);
        // Remove the first (oldest) element by key. array_shift() would
        // renumber integer keys, and a checksum made only of decimal
        // digits is stored as an integer key, so it would be lost.
        reset($this->urlCache[$source]);
        unset($this->urlCache[$source][key($this->urlCache[$source])]);
    }

    // Handle the shortened cache the same way
    $this->shortCache[$source][$shortenedUrl] = $time;
    if ($this->limit > 0 && count($this->shortCache[$source]) > $this->limit) {
        asort($this->shortCache[$source], SORT_NUMERIC);
        reset($this->shortCache[$source]);
        unset($this->shortCache[$source][key($this->shortCache[$source])]);
    }
}
/**
 * Passes a string through decodeTranslit() and, when a positive maximum
 * length is given, truncates it to that many bytes, appending an ellipsis
 * if anything was cut off.
 *
 * @param string $str  String to decode
 * @param int    $trim Maximum string length, optional
 *
 * @return string
 */
protected function decode($str, $trim = null)
{
    $text = $this->decodeTranslit($str);

    // No (positive) limit requested: return the text untouched
    if (!($trim > 0)) {
        return $text;
    }

    $head = substr($text, 0, $trim);
    $suffix = strlen($text) > $trim ? '...' : '';

    return $head . $suffix;
}
/**
 * Custom error handler meant to handle 404 errors and such.
 *
 * Only E_WARNING is inspected. A warning carrying an HTTP status line
 * stores the status code and its message; a warning matching one of the
 * known connection/SSL failure texts just sets the error flag.
 *
 * @param int    $errno   the error code
 * @param string $errstr  the error string
 * @param string $errfile file the error occurred in
 * @param int    $errline line the error occurred on
 *
 * @return bool true if the warning was recognised and recorded
 */
public function onPhpError($errno, $errstr, $errfile, $errline)
{
    if ($errno !== E_WARNING) {
        return false;
    }

    // A space was missing before "in", gluing the file path onto the message
    $logLine = 'PHP Warning: ' . $errstr . ' in ' . $errfile . ' on line ' . $errline;

    // Check to see if there was HTTP warning while connecting to the site
    if (preg_match('{HTTP/1\.[01] ([0-9]{3})}i', $errstr, $m)) {
        $this->errorStatus = $m[1];
        $this->errorMessage = (isset($this->httpErrors[$m[1]]) ? $this->httpErrors[$m[1]] : $m[1]);
        $this->debug($logLine);
        return true;
    }

    // Known connection/SSL warning fragments that only flag an error
    $knownFailures = array(
        'SSL: fatal protocol error in',
        'failed to open stream',
        'HTTP request failed',
        'SSL: An existing connection was forcibly closed by the remote host',
        'Failed to enable crypto in',
        'SSL: An established connection was aborted by the software in your host machine',
        'SSL operation failed with code',
        'unable to connect to',
    );
    foreach ($knownFailures as $fragment) {
        if (stripos($errstr, $fragment) !== false) {
            $this->errorStatus = true;
            $this->debug($logLine);
            return true;
        }
    }

    return false;
}
/**
 * Reduces a URL to its base form and returns the hex CRC32 checksum of
 * the normalized result.
 *
 * @param string $url url to checksum
 *
 * @return string the hex checksum of the cleaned url
 */
protected function getUrlChecksum($url)
{
    // Base form, as produced by glueUrl() with $base set
    $base = $this->glueUrl($url, true);

    // Decode percent-escapes, lowercase, run the translit hook and drop
    // all whitespace
    $normalized = strtolower(urldecode($base));
    $normalized = $this->decodeTranslit($normalized);
    $normalized = preg_replace('#\s#', '', $normalized);

    return dechex(crc32($normalized));
}
/**
 * Parses a given URI and processes the output to normalize it.
 *
 * Prepends "http://" when the string has no scheme, lowercases the scheme
 * and, when parse_url() reports only a path, splits that path into host
 * and path.
 *
 * @param string|array $url the url to parse; an array is returned as is
 *
 * @return array|false the url components, or false if the URL could not
 *                     be parsed
 */
protected function parseUrl($url)
{
    if (is_array($url)) {
        return $url;
    }

    $url = trim(ltrim($url, ' /@\\'));
    if (!preg_match('&^(?:([a-z][-+.a-z0-9]*):)&xis', $url)) {
        $url = 'http://' . $url;
    }

    $parsed = parse_url($url);
    // parse_url() returns false for seriously malformed URLs. Report that
    // to the caller instead of turning false into array('scheme' => 'http').
    if (!is_array($parsed)) {
        return false;
    }

    if (!isset($parsed['scheme'])) {
        $parsed['scheme'] = 'http';
    }
    $parsed['scheme'] = strtolower($parsed['scheme']);

    // Only a path was recognised: treat its first segment as the host
    if (isset($parsed['path']) && !isset($parsed['host'])) {
        $host = $parsed['path'];
        $path = '';
        if (strpos($parsed['path'], '/') !== false) {
            list($host, $path) = array_pad(explode('/', $parsed['path'], 2), 2, null);
        }
        $parsed['host'] = $host;
        $parsed['path'] = $path;
    }

    return $parsed;
}
/**
 * Parses a given URI and then glues it back together in the proper format.
 * If base is set, the scheme, user, pass and fragment are left out and a
 * leading "www." is stripped from the host, giving a more unique base URI.
 *
 * @param string|array $uri  uri to rebuild (or already parsed components)
 * @param bool         $base set to true to only return the base components
 *
 * @return string the rebuilt uri; the input is returned unchanged when it
 *                cannot be parsed into components
 */
protected function glueUrl($uri, $base = false)
{
    $parts = is_array($uri) ? $uri : $this->parseUrl($uri);
    if (!is_array($parts)) {
        return $uri;
    }

    $out = '';

    // Scheme and credentials are only part of the full form
    if (!$base) {
        if (!empty($parts['scheme'])) {
            $out .= $parts['scheme'] . ':';
            if (strtolower($parts['scheme']) != 'mailto') {
                $out .= '//';
            }
        }
        if (!empty($parts['user'])) {
            $out .= $parts['user'];
            if (!empty($parts['pass'])) {
                $out .= ':' . $parts['pass'];
            }
            $out .= '@';
        }
    }

    // Host, without a leading "www." in base form
    if ($base && !empty($parts['host'])) {
        $parts['host'] = trim($parts['host']);
        if (substr($parts['host'], 0, 4) == 'www.') {
            $parts['host'] = substr($parts['host'], 4);
        }
    }
    if (!empty($parts['host'])) {
        $out .= $parts['host'];
    }

    // Port, unless it is the default one for the scheme
    if (!empty($parts['port'])) {
        $isDefaultPort = ($parts['scheme'] == 'http' && $parts['port'] == 80)
            || ($parts['scheme'] == 'https' && $parts['port'] == 443);
        if (!$isDefaultPort) {
            $out .= ':' . $parts['port'];
        }
    }

    // Path; a bare "/" is dropped in base form
    if (!empty($parts['path']) && (!$base || $parts['path'] != '/')) {
        if (substr($parts['path'], 0, 1) != '/') {
            $out .= '/';
        }
        $out .= $parts['path'];
    }

    if (!empty($parts['query'])) {
        $out .= '?' . $parts['query'];
    }

    if (!$base && !empty($parts['fragment'])) {
        $out .= '#' . $parts['fragment'];
    }

    return $out;
}
/**
 * Checks the given string to see if it is a valid IPv4 address by
 * converting it to its numeric form and back.
 *
 * @param string $ip the ip to validate
 *
 * @return bool true if the round trip reproduces the input exactly
 */
protected function checkValidIP($ip)
{
    $numeric = ip2long($ip);
    $canonical = long2ip($numeric);

    return $canonical === $ip;
}
/**
 * Returns the title of the given page.
 *
 * The page is fetched through the Http plugin. For a response that is not
 * (X)HTML the Content-Type header is used as the title; a <title> element
 * found in the content takes precedence. If no title could be determined,
 * the response's status string (on error) or "No Title" is returned.
 *
 * @param string $url url to the page
 *
 * @return string title
 */
public function getTitle($url)
{
    $http = $this->plugins->getPlugin('Http');

    $options = array(
        'user_agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12'
    );

    $response = $http->get($url, array(), $options);

    // Start from a defined value so the empty() check below never reads
    // an unset variable
    $title = '';

    $header = $response->getHeaders('Content-Type');
    // The "+" must be escaped: unescaped it is a quantifier and the pattern
    // can never match the literal MIME type application/xhtml+xml
    if (!preg_match('#^(text/x?html|application/xhtml\+xml)(?:;.*)?$#', $header)) {
        $title = $header;
    }

    $content = $response->getContent();
    $match = null;
    if (preg_match('#<title[^>]*>(.*?)</title>#is', $content, $match)) {
        $title = html_entity_decode(trim($match[1]));
    }

    if (empty($title)) {
        if ($response->isError()) {
            $title = $response->getCodeAsString();
        } else {
            $title = 'No Title';
        }
    }

    return $title;
}
/**
 * Output a debug message, prefixed with "(DEBUG:Url)" and terminated by a
 * newline, on standard output.
 *
 * @param string $msg the message to output
 *
 * @return void
 */
protected function debug($msg)
{
    $line = '(DEBUG:Url) ' . $msg . "\n";
    echo $line;
}
/**
 * Placeholder/porting helper. Performs no conversion.
 *
 * @param string $str a string to return
 *
 * @return string the input, unchanged
 */
protected function decodeTranslit($str)
{
    // placeholder/porting helper: hands the input straight back
    $result = $str;

    return $result;
}
/**
 * Add a renderer to the stack.
 *
 * Each registered renderer has its renderUrl() method called with the
 * parsed URL when a message is processed. Registering the same object
 * twice has no effect.
 *
 * @param object $obj the renderer to add
 *
 * @return void
 */
public function registerRenderer($obj)
{
    // Identity check instead of array_unique(): that call's result was
    // discarded, and it compares elements as strings, which fails for
    // objects without __toString()
    if (!in_array($obj, $this->renderers, true)) {
        $this->renderers[] = $obj;
    }
}