9 * This source file is subject to the new BSD license that is bundled
10 * with this package in the file LICENSE.
11 * It is also available through the world-wide-web at this URL:
12 * http://phergie.org/license
15 * @package Phergie_Plugin_Url
16 * @author Phergie Development Team <team@phergie.org>
17 * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
18 * @license http://phergie.org/license New BSD License
19 * @link http://pear.phergie.org/package/Phergie_Plugin_Url
23 * Monitors incoming messages for instances of URLs and responds with messages
24 * containing relevant information about detected URLs.
26 * Has a utility method accessible via
27 * $this->getPlugin('Url')->getTitle('http://foo..').
30 * @package Phergie_Plugin_Url
31 * @author Phergie Development Team <team@phergie.org>
32 * @license http://phergie.org/license New BSD License
33 * @link http://pear.phergie.org/package/Phergie_Plugin_Url
34 * @uses Phergie_Plugin_Encoding pear.phergie.org
35 * @uses Phergie_Plugin_Http pear.phergie.org
36 * @uses Phergie_Plugin_Tld pear.phergie.org
38 class Phergie_Plugin_Url extends Phergie_Plugin_Abstract
/**
 * Template for the final reply.
 *
 * The default consists solely of the %message% variable, which stands for
 * the link text built with $messageFormat.
 *
 * @var string
 */
protected $baseFormat = '%message%';

/**
 * Template for the text generated for each detected link.
 *
 * Can use the variables %nick%, %title% and %link% in it to display
 * page titles and links.
 *
 * @var string
 */
protected $messageFormat = '[ %link% ] %title%';

/**
 * Whether a message containing several links is answered with a single
 * combined response (true) or with one response per link (false).
 *
 * @var bool
 */
protected $mergeLinks = true;

/**
 * Max length of the fetched URL title.
 *
 * @var int
 */
protected $titleLength = 40;

/**
 * Cache of URLs already handled, used to prevent spamming (especially
 * with multiple bots). Indexed by event source, then by URL checksum;
 * each value is the time the URL was last cached.
 *
 * @var array
 */
protected $urlCache = array();

/**
 * Same layout as $urlCache, but keyed by the checksum of the shortened URL.
 *
 * @var array
 */
protected $shortCache = array();

/**
 * Time in seconds to store the cached entries.
 *
 * Setting it to 0 or below disables the cache expiration.
 *
 * @var int
 */
protected $expire = 1800;

/**
 * Number of entries to keep in the cache at one time per channel.
 *
 * Setting it to 0 or below disables the cache limit.
 *
 * @var int
 */
protected $limit = 10;

/**
 * Whether an https URL is downgraded to http when the OpenSSL extension
 * is not loaded. When false, such URLs are rejected instead.
 *
 * @var bool
 */
protected $sslFallback = true;

/**
 * Flag that is set to true by the custom error handler if an HTTP error
 * code has been received.
 *
 * NOTE(review): no method in the reviewed part of this class reads or
 * writes this flag - confirm it is still in use.
 *
 * @var bool
 */
protected $errorStatus = false;

/**
 * Error message companion to $errorStatus.
 *
 * NOTE(review): not referenced by the reviewed methods either.
 *
 * @var string|null
 */
protected $errorMessage = null;

/**
 * Flag indicating whether or not to display error messages as the title
 * if a posted link encounters an error.
 *
 * @var bool
 */
protected $showErrors = true;

/**
 * Flag indicating whether to detect schemeless URLs (i.e. "example.com").
 *
 * @var bool
 */
protected $detectSchemeless = false;

/**
 * URL shortener instance created in onLoad(); must be an instance of
 * Phergie_Plugin_Url_Shorten_Abstract.
 *
 * @var Phergie_Plugin_Url_Shorten_Abstract
 */
protected $shortener;

/**
 * Registered renderers, keyed by their spl_object_hash().
 *
 * @var array
 */
protected $renderers = array();
/**
 * Checks for dependencies and loads the plugin configuration.
 *
 * Requests the Encoding, Http and Tld plugins, instantiates the shortener
 * class named by the "url.shortener" setting (default "Trim") and copies
 * every "url.*" setting that is present onto the matching property.
 *
 * @return void
 */
public function onLoad()
{
    $plugins = $this->plugins;
    $plugins->getPlugin('Encoding');
    $plugins->getPlugin('Http');
    $plugins->getPlugin('Tld');

    // make the shortener configurable
    $shortener = $this->getConfig('url.shortener', 'Trim');
    $shortener = "Phergie_Plugin_Url_Shorten_{$shortener}";
    $this->shortener = new $shortener($this->plugins->getPlugin('Http'));

    if (!$this->shortener instanceof Phergie_Plugin_Url_Shorten_Abstract) {
        $this->fail("Declared shortener class {$shortener} is not of proper ancestry");
    }

    // Map of configuration suffix => property that receives the value
    $settings = array(
        'detect_schemeless' => 'detectSchemeless',
        'base_format' => 'baseFormat',
        'message_format' => 'messageFormat',
        'merge_links' => 'mergeLinks',
        'title_length' => 'titleLength',
        'show_errors' => 'showErrors',
        'expire' => 'expire',
    );

    foreach ($settings as $config => $local) {
        $key = "url.{$config}";
        if (isset($this->config[$key])) {
            // Read the same key that was just tested; the value used to be
            // fetched from "uri.*", so configured settings were never applied.
            $this->$local = $this->config[$key];
        }
    }
}
/**
 * Handles an incoming message event: looks for URLs in it and, when one is
 * found, responds with its title if it is an HTML document and with the
 * shortened equivalent of the original URL if it meets length requirements.
 *
 * @todo Update this to pull configuration settings from $this->config
 *       rather than caching them as class properties
 *
 * @return void
 */
public function onPrivmsg()
{
    // NOTE(review): the body of this method is not visible in the reviewed
    // excerpt; delegation to handleMsg() is assumed - confirm.
    $this->handleMsg();
}
/**
 * Handles an incoming action event: looks for URLs in it and, when one is
 * found, responds with its title if it is an HTML document and with the
 * shortened equivalent of the original URL if it meets length requirements.
 *
 * @todo Update this to pull configuration settings from $this->config
 *       rather than caching them as class properties
 *
 * @return void
 */
public function onAction()
{
    // NOTE(review): the body of this method is not visible in the reviewed
    // excerpt; delegation to handleMsg() is assumed - confirm.
    $this->handleMsg();
}
/**
 * Handles message events and responds with url titles.
 *
 * For every URL found in the event's second argument: registered renderers
 * get the first chance to handle it; otherwise the URL is shortened,
 * checked against the cache, its title is fetched and a response is built
 * from $messageFormat. The collected responses are then sent to the event
 * source, either merged into one message or one message per link depending
 * on $mergeLinks.
 *
 * @return void
 */
protected function handleMsg()
{
    $event = $this->getEvent();
    $source = $event->getSource();
    $user = $event->getNick();

    $responses = array();
    $urls = $this->findUrls($event->getArgument(1));

    foreach ($urls as $parsed) {
        $url = $parsed['glued'];

        // allow out-of-class renderers to handle this URL
        foreach ($this->renderers as $renderer) {
            if ($renderer->renderUrl($parsed) === true) {
                // renderers should return true if they've fully
                // rendered the passed URL (they're responsible
                // for their own output)
                $this->debug('Handled by renderer: ' . get_class($renderer));
                continue 2;
            }
        }

        // Fall back to the original URL when it cannot be shortened
        $shortenedUrl = $this->shortener->shorten($url);
        if (!$shortenedUrl) {
            $this->debug('Invalid Url: Unable to shorten. (' . $url . ')');
            $shortenedUrl = $url;
        }

        // Skip URLs that were already handled for this source
        if ($this->checkUrlCache($url, $shortenedUrl)) {
            $this->debug('Invalid Url: URL is in the cache. (' . $url . ')');
            continue;
        }

        $title = $this->getTitle($url);
        if (!empty($title)) {
            // NOTE(review): the placeholder lists are not visible in the
            // reviewed excerpt; they are assumed from the documented
            // %title%, %link% and %nick% variables - confirm.
            $responses[] = str_replace(
                array('%title%', '%link%', '%nick%'),
                array($title, $shortenedUrl, $user),
                $this->messageFormat
            );
        }

        $this->updateUrlCache($url, $shortenedUrl);
    }

    if (count($responses) === 0) {
        return;
    }

    // One combined text when merging, otherwise each response on its own.
    // The unmerged case used to send the full joined list once per
    // response instead of the individual response.
    $texts = $this->mergeLinks
        ? array(implode('; ', $responses))
        : $responses;

    foreach ($texts as $text) {
        $message = str_replace(
            array('%message%', '%nick%'),
            array($text, $user),
            $this->baseFormat
        );
        $this->doPrivmsg($source, $message);
    }
}
/**
 * Detect URLs in a given string.
 *
 * Each candidate matched by the pattern is validated (not an email or
 * directory path, parsable, valid IP or supported TLD, http/https scheme)
 * and rejected with a debug message when a check fails.
 *
 * @param string $message the string to detect urls in
 *
 * @return array the array of urls found; every entry holds the parsed URL
 *               components plus 'glued' (the rebuilt URL) and, for
 *               non-IP hosts, 'tld'
 */
public function findUrls($message)
{
    $schemePart = $this->detectSchemeless ? '' : 'https?://';
    $prefixPart = $this->detectSchemeless ? '(?<!http:/|https:/)[@/\\\]' : '';
    $pattern = '#' . $schemePart
        . '(?:([0-9]{1,3}(?:\.[0-9]{1,3}){3})(?![^/]) | ('
        . $prefixPart
        . ')?(?:(?:[a-z0-9_-]+\.?)+\.[a-z0-9]{1,6}))[^\s]*#xis';

    $found = array();
    if (!preg_match_all($pattern, $message, $matches, PREG_SET_ORDER)) {
        return $found;
    }

    foreach ($matches as $m) {
        // Drop trailing punctuation that is usually not part of the URL
        $url = trim(rtrim($m[0], ', ].?!;'));

        // Group 2 captures a leading "@", "/" or "\" (email, path, ...)
        if (!empty($m[2])) {
            $this->debug('Invalid Url: URL is either an email or a directory path. (' . $url . ')');
            continue;
        }

        $parsed = $this->parseUrl($url);
        if (!$parsed) {
            $this->debug('Invalid Url: Could not parse the URL. (' . $url . ')');
            continue;
        }

        // Group 1 is only filled when the host is a dotted-quad address
        $isIp = !empty($m[1]);
        if ($isIp && !$this->checkValidIP($m[1])) {
            $this->debug('Invalid Url: ' . $m[1] . ' is not a valid IP address. (' . $url . ')');
            continue;
        }

        if (!$isIp) {
            // The TLD is whatever follows the last dot of the host
            $dot = strrpos($parsed['host'], '.');
            if ($dot !== false) {
                $parsed['tld'] = substr($parsed['host'], ($dot + 1));
            } else {
                $parsed['tld'] = '';
            }

            if ($this->plugins->tld->getTld($parsed['tld']) === false) {
                $this->debug('Invalid Url: ' . $parsed['tld'] . ' is not a supported TLD. (' . $url . ')');
                continue;
            }
        }

        // Without OpenSSL an https URL is either rejected or downgraded
        if ($parsed['scheme'] == 'https' && !extension_loaded('openssl')) {
            if (!$this->sslFallback) {
                $this->debug('Invalid Url: HTTPS is an invalid scheme, OpenSSL isn\'t available. (' . $url . ')');
                continue;
            }
            $parsed['scheme'] = 'http';
        }

        $supported = array('http', 'https');
        if (!in_array($parsed['scheme'], $supported)) {
            $this->debug('Invalid Url: ' . $parsed['scheme'] . ' is not a supported scheme. (' . $url . ')');
            continue;
        }

        $glued = array('glued' => $this->glueUrl($parsed));
        $found[] = $parsed + $glued;
    }

    return $found;
}
/**
 * Checks a given URL (+shortened) against the cache to verify if they were
 * previously posted on the channel.
 *
 * @param string $url          The URL to check against
 * @param string $shortenedUrl The shortened URL to check against
 *
 * @return bool true when either form of the URL counts as cached
 */
protected function checkUrlCache($url, $shortenedUrl)
{
    $source = $this->getEvent()->getSource();

    // The caches are keyed by a hex CRC32 checksum of the cleaned URL
    $urlKey = $this->getUrlChecksum($url);
    $shortKey = $this->getUrlChecksum($shortenedUrl);

    $urlStamp = null;
    if (isset($this->urlCache[$source][$urlKey])) {
        $urlStamp = $this->urlCache[$source][$urlKey];
    }

    $shortStamp = null;
    if (isset($this->shortCache[$source][$shortKey])) {
        $shortStamp = $this->shortCache[$source][$shortKey];
    }

    $expire = $this->expire;
    $this->debug("Cache expire: {$expire}");

    if ($expire > 0) {
        // Expiration enabled: an entry only counts while it is fresh
        return ($urlStamp + $expire) > time()
            || ($shortStamp + $expire) > time();
    }

    // Expiration disabled: any cached entry counts
    return isset($urlStamp) || isset($shortStamp);
}
/**
 * Updates the cache and adds the given URL (+shortened) to the cache. It
 * also handles cleaning the cache of old entries as well.
 *
 * Both caches are keyed by event source and then by the hex CRC32 checksum
 * of the cleaned URL; the stored value is the current time. When $limit is
 * above 0 and a source holds more entries than that, the oldest entry is
 * removed.
 *
 * @param string $url          The URL to add to the cache
 * @param string $shortenedUrl The shortened URL to add to the cache
 *
 * @return void
 */
protected function updateUrlCache($url, $shortenedUrl)
{
    $source = $this->getEvent()->getSource();

    $urlKey = $this->getUrlChecksum($url);
    $shortKey = $this->getUrlChecksum($shortenedUrl);
    $time = time();

    // Handle the URL cache and remove the oldest entry if the limit is exceeded
    $this->urlCache[$source][$urlKey] = $time;
    if ($this->limit > 0 && count($this->urlCache[$source]) > $this->limit) {
        asort($this->urlCache[$source], SORT_NUMERIC);
        // Remove the oldest entry by key rather than with array_shift():
        // checksums made only of digits are stored as integer keys, and
        // array_shift() renumbers integer keys, which would make those
        // entries unreachable on the next lookup.
        reset($this->urlCache[$source]);
        unset($this->urlCache[$source][key($this->urlCache[$source])]);
    }

    // Same handling for the shortened URL cache
    $this->shortCache[$source][$shortKey] = $time;
    if ($this->limit > 0 && count($this->shortCache[$source]) > $this->limit) {
        asort($this->shortCache[$source], SORT_NUMERIC);
        reset($this->shortCache[$source]);
        unset($this->shortCache[$source][key($this->shortCache[$source])]);
    }
}
/**
 * Transliterates a string via the Encoding plugin and, when a maximum
 * length is given, truncates the result and appends an ellipsis if it
 * exceeded that length.
 *
 * @param string $str  String to decode
 * @param int    $trim Maximum string length, optional
 *
 * @return string the transliterated, possibly truncated string
 */
protected function decode($str, $trim = null)
{
    $out = $this->plugins->encoding->transliterate($str);

    // NOTE(review): the guard around the truncation is not visible in the
    // reviewed excerpt; "$trim > 0" is assumed - confirm.
    if ($trim > 0) {
        $ellipsis = '';
        if (strlen($out) > $trim) {
            $ellipsis = '...';
        }
        $out = substr($out, 0, $trim) . $ellipsis;
    }

    return $out;
}
/**
 * Reduces a URL to its base form, normalizes it and returns the hex CRC32
 * checksum of the result.
 *
 * Normalization: base components only (see glueUrl() with $base = true),
 * URL-decoded, lowercased, transliterated and stripped of whitespace.
 *
 * @param string $url url to checksum
 *
 * @return string the hex checksum of the cleaned url
 */
protected function getUrlChecksum($url)
{
    $baseUrl = $this->glueUrl($url, true);
    $lowered = strtolower(urldecode($baseUrl));
    $ascii = $this->plugins->encoding->transliterate($lowered);
    $compact = preg_replace('#\s#', '', $ascii);

    return dechex(crc32($compact));
}
/**
 * Parses a given URI and processes the output to normalize it.
 *
 * Leading spaces, slashes, "@" and backslashes are stripped, "http://" is
 * prepended when the string does not start with a scheme, and the scheme
 * is lowercased. When parse_url() yields a path but no host, the part of
 * the path before the first "/" becomes the host.
 *
 * @param string|array $url the url to parse; an array is returned as is
 *
 * @return array|false the url components, or false if the URL is too
 *                     malformed for parse_url() to handle
 */
protected function parseUrl($url)
{
    if (is_array($url)) {
        return $url;
    }

    $url = trim(ltrim($url, ' /@\\'));
    if (!preg_match('&^(?:([a-z][-+.a-z0-9]*):)&xis', $url)) {
        $url = 'http://' . $url;
    }

    $parsed = parse_url($url);
    if ($parsed === false) {
        // Report the failure instead of building an array around it; both
        // callers already test for a non-array / falsy result.
        return false;
    }

    if (!isset($parsed['scheme'])) {
        $parsed['scheme'] = 'http';
    }
    $parsed['scheme'] = strtolower($parsed['scheme']);

    if (isset($parsed['path']) && !isset($parsed['host'])) {
        $host = $parsed['path'];
        $path = '';
        if (strpos($parsed['path'], '/') !== false) {
            list($host, $path) = array_pad(explode('/', $parsed['path'], 2), 2, null);
        }
        $parsed['host'] = $host;
        $parsed['path'] = $path;
    }

    return $parsed;
}
/**
 * Parses a given URI and then glues it back together in the proper format.
 * If base is set, the scheme, user, pass and fragment are left out, a
 * leading "www." is removed from the host and a path of just "/" is
 * dropped, which yields a more unique base URI.
 *
 * Default ports (80 for http, 443 for https) are never included.
 *
 * @param string|array $uri  uri to rebuild, or already parsed components
 * @param bool         $base set to true to only return the base components
 *
 * @return string the rebuilt uri; the input is returned unchanged when it
 *                cannot be parsed into an array
 */
protected function glueUrl($uri, $base = false)
{
    $parts = is_array($uri) ? $uri : $this->parseUrl($uri);
    if (!is_array($parts)) {
        return $uri;
    }

    $glued = '';

    if (!$base) {
        if (!empty($parts['scheme'])) {
            $glued .= $parts['scheme'] . ':';
            // mailto: is the one scheme written without "//"
            if (strtolower($parts['scheme']) != 'mailto') {
                $glued .= '//';
            }
        }
        if (!empty($parts['user'])) {
            $glued .= $parts['user'];
            if (!empty($parts['pass'])) {
                $glued .= ':' . $parts['pass'];
            }
            $glued .= '@';
        }
    }

    $host = isset($parts['host']) ? $parts['host'] : '';
    if ($base && !empty($host)) {
        $host = trim($host);
        if (substr($host, 0, 4) == 'www.') {
            $host = substr($host, 4);
        }
    }
    if (!empty($host)) {
        $glued .= $host;
    }

    if (!empty($parts['port'])) {
        $isDefaultPort = ($parts['scheme'] == 'http' && $parts['port'] == 80)
            || ($parts['scheme'] == 'https' && $parts['port'] == 443);
        if (!$isDefaultPort) {
            $glued .= ':' . $parts['port'];
        }
    }

    if (!empty($parts['path']) && (!$base || $parts['path'] != '/')) {
        if (substr($parts['path'], 0, 1) != '/') {
            $glued .= '/';
        }
        $glued .= $parts['path'];
    }

    if (!empty($parts['query'])) {
        $glued .= '?' . $parts['query'];
    }

    if (!$base && !empty($parts['fragment'])) {
        $glued .= '#' . $parts['fragment'];
    }

    return $glued;
}
/**
 * Checks the given string to see if it is a valid IPv4 address.
 *
 * The address is converted to its integer form and back; it is valid only
 * when the round trip reproduces the input exactly.
 *
 * @param string $ip the ip to validate
 *
 * @return bool true if the string is a valid IPv4 address
 */
protected function checkValidIP($ip)
{
    $asLong = ip2long($ip);
    $roundTrip = long2ip($asLong);

    return $roundTrip === $ip;
}
/**
 * Returns the title of the given page.
 *
 * A HEAD request determines the content type. For (X)HTML documents the
 * page is fetched and the content of its <title> element is returned with
 * whitespace collapsed and entities decoded; for other types the
 * Content-Type header value is used as the title.
 *
 * @param string $url url to the page
 *
 * @return string title
 */
public function getTitle($url)
{
    $http = $this->plugins->getPlugin('Http');

    $options = array(
        // NOTE(review): the timeout entry is not visible in the reviewed
        // excerpt; 3.5 seconds is assumed - confirm.
        'timeout' => 3.5,
        'user_agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12'
    );

    $response = $http->head($url, array(), $options);
    $header = $response->getHeaders('Content-Type');

    // Start with an empty title so a page without a <title> element does
    // not leave the variable undefined.
    $title = '';

    // The "+" must be escaped: unescaped it is a quantifier, so the literal
    // type "application/xhtml+xml" would never match.
    if (!preg_match('#^(text/x?html|application/xhtml\+xml)(?:;.*)?$#', $header)) {
        // NOTE(review): the non-HTML branch is not visible in the reviewed
        // excerpt; using the content type as the title is assumed - confirm.
        $title = $header;
    } else {
        $response = $http->get($url, array(), $options);
        $content = $response->getContent();
        if (preg_match('#<title[^>]*>(.*?)</title>#is', $content, $match)) {
            $title = preg_replace('/[\s\v]+/', ' ', trim($match[1]));
        }
    }

    $encoding = $this->plugins->getPlugin('Encoding');
    $title = $encoding->decodeEntities($title);

    // NOTE(review): the empty-title guard and the 'No Title' fallback are
    // not visible in the reviewed excerpt and are assumed - confirm.
    if (empty($title)) {
        if ($response->isError()) {
            $title = $response->getCodeAsString();
        } else {
            $title = 'No Title';
        }
    }

    return $title;
}
/**
 * Output a debug message, prefixed with "(DEBUG:Url) " and followed by a
 * newline, to standard output.
 *
 * @param string $msg the message to output
 *
 * @return void
 */
protected function debug($msg)
{
    echo '(DEBUG:Url) ' . $msg . "\n";
}
/**
 * Add a renderer to the stack.
 *
 * Renderers are stored under their object hash, so registering the same
 * object again replaces its existing entry.
 *
 * @param object $obj the renderer to add
 *
 * @return void
 */
public function registerRenderer($obj)
{
    $hash = spl_object_hash($obj);
    $this->renderers[$hash] = $obj;
}