mod/parse_url.php

   1 <?php
   2 /**
   3  * @file mod/parse_url.php
   4  *
   5  * @todo https://developers.google.com/+/plugins/snippet/
   6  *
   7  * @verbatim
   8  * <meta itemprop="name" content="Toller Titel">
   9  * <meta itemprop="description" content="Eine tolle Beschreibung">
  10  * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
  11  *
  12  * <body itemscope itemtype="http://schema.org/Product">
  13  *   <h1 itemprop="name">Shiny Trinket</h1>
  14  *   <img itemprop="image" src="{image-url}" />
  15  *   <p itemprop="description">Shiny trinkets are shiny.</p>
  16  * </body>
  17  * @endverbatim
  18 */
  19
  20 if(!function_exists('deletenode')) {
  21         function deletenode(&$doc, $node)
  22         {
  23                 $xpath = new DomXPath($doc);
  24                 $list = $xpath->query("//".$node);
  25                 foreach ($list as $child)
  26                         $child->parentNode->removeChild($child);
  27         }
  28 }
  29
  30 function completeurl($url, $scheme) {
  31         $urlarr = parse_url($url);
  32
  33         if (isset($urlarr["scheme"]))
  34                 return($url);
  35
  36         $schemearr = parse_url($scheme);
  37
  38         $complete = $schemearr["scheme"]."://".$schemearr["host"];
  39
  40         if (@$schemearr["port"] != "")
  41                 $complete .= ":".$schemearr["port"];
  42
  43                 if(strpos($urlarr['path'],'/') !== 0)
  44                         $complete .= '/';
  45
  46         $complete .= $urlarr["path"];
  47
  48         if (@$urlarr["query"] != "")
  49                 $complete .= "?".$urlarr["query"];
  50
  51         if (@$urlarr["fragment"] != "")
  52                 $complete .= "#".$urlarr["fragment"];
  53
  54         return($complete);
  55 }
  56
  57 function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) {
  58
  59         if ($url == "")
  60                 return false;
  61
  62         $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
  63                 dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed));
  64
  65         if ($r)
  66                 $data = $r[0]["content"];
  67
  68         if (!is_null($data)) {
  69                 $data = unserialize($data);
  70                 return $data;
  71         }
  72
  73         $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed);
  74
  75         q("INSERT INTO `parsed_url` (`url`, `guessing`, `oembed`, `content`, `created`) VALUES ('%s', %d, %d, '%s', '%s')",
  76                 dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed), dbesc(serialize($data)), dbesc(datetime_convert()));
  77
  78         return $data;
  79 }
  80
  81 function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
  82         require_once("include/network.php");
  83
  84         $a = get_app();
  85
  86         $siteinfo = array();
  87
  88         if ($count > 10) {
  89                 logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
  90                 return($siteinfo);
  91         }
  92
  93         $url = trim($url, "'");
  94         $url = trim($url, '"');
  95
  96         $url = original_url($url);
  97
  98         $siteinfo["url"] = $url;
  99         $siteinfo["type"] = "link";
 100
 101         $stamp1 = microtime(true);
 102
 103         $ch = curl_init();
 104         curl_setopt($ch, CURLOPT_URL, $url);
 105         curl_setopt($ch, CURLOPT_HEADER, 1);
 106         curl_setopt($ch, CURLOPT_NOBODY, 1);
 107         curl_setopt($ch, CURLOPT_TIMEOUT, 3);
 108         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 109         //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
 110         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 111
 112         $header = curl_exec($ch);
 113         $curl_info = @curl_getinfo($ch);
 114         $http_code = $curl_info['http_code'];
 115         curl_close($ch);
 116
 117         $a->save_timestamp($stamp1, "network");
 118
 119         if ((($curl_info['http_code'] == "301") OR ($curl_info['http_code'] == "302") OR ($curl_info['http_code'] == "303") OR ($curl_info['http_code'] == "307"))
 120                 AND (($curl_info['redirect_url'] != "") OR ($curl_info['location'] != ""))) {
 121                 if ($curl_info['redirect_url'] != "")
 122                         $siteinfo = parseurl_getsiteinfo($curl_info['redirect_url'], $no_guessing, $do_oembed, ++$count);
 123                 else
 124                         $siteinfo = parseurl_getsiteinfo($curl_info['location'], $no_guessing, $do_oembed, ++$count);
 125                 return($siteinfo);
 126         }
 127
 128         // if the file is too large then exit
 129         if ($curl_info["download_content_length"] > 1000000)
 130                 return($siteinfo);
 131
 132         // if it isn't a HTML file then exit
 133         if (($curl_info["content_type"] != "") AND !strstr(strtolower($curl_info["content_type"]),"html"))
 134                 return($siteinfo);
 135
 136         if ($do_oembed) {
 137                 require_once("include/oembed.php");
 138
 139                 $oembed_data = oembed_fetch_url($url);
 140
 141                 if ($oembed_data->type != "error")
 142                         $siteinfo["type"] = $oembed_data->type;
 143
 144                 if (($oembed_data->type == "link") AND ($siteinfo["type"] != "photo")) {
 145                         if (isset($oembed_data->title))
 146                                 $siteinfo["title"] = $oembed_data->title;
 147                         if (isset($oembed_data->description))
 148                                 $siteinfo["text"] = trim($oembed_data->description);
 149                         if (isset($oembed_data->thumbnail_url))
 150                                 $siteinfo["image"] = $oembed_data->thumbnail_url;
 151                 }
 152         }
 153
 154         $stamp1 = microtime(true);
 155
 156         // Now fetch the body as well
 157         $ch = curl_init();
 158         curl_setopt($ch, CURLOPT_URL, $url);
 159         curl_setopt($ch, CURLOPT_HEADER, 1);
 160         curl_setopt($ch, CURLOPT_NOBODY, 0);
 161         curl_setopt($ch, CURLOPT_TIMEOUT, 10);
 162         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 163         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 164
 165         $header = curl_exec($ch);
 166         $curl_info = @curl_getinfo($ch);
 167         $http_code = $curl_info['http_code'];
 168         curl_close($ch);
 169
 170         $a->save_timestamp($stamp1, "network");
 171
 172         // Fetch the first mentioned charset. Can be in body or header
 173         $charset = "";
 174         if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 175                 $charset = trim(trim(trim(array_pop($matches)), ';,'));
 176
 177         if ($charset == "")
 178                 $charset = "utf-8";
 179
 180         $pos = strpos($header, "\r\n\r\n");
 181
 182         if ($pos)
 183                 $body = trim(substr($header, $pos));
 184         else
 185                 $body = $header;
 186
 187         if (($charset != '') AND (strtoupper($charset) != "UTF-8")) {
 188                 logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
 189                 //$body = mb_convert_encoding($body, "UTF-8", $charset);
 190                 $body = iconv($charset, "UTF-8//TRANSLIT", $body);
 191         }
 192
 193         $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
 194
 195         $doc = new DOMDocument();
 196         @$doc->loadHTML($body);
 197
 198         deletenode($doc, 'style');
 199         deletenode($doc, 'script');
 200         deletenode($doc, 'option');
 201         deletenode($doc, 'h1');
 202         deletenode($doc, 'h2');
 203         deletenode($doc, 'h3');
 204         deletenode($doc, 'h4');
 205         deletenode($doc, 'h5');
 206         deletenode($doc, 'h6');
 207         deletenode($doc, 'ol');
 208         deletenode($doc, 'ul');
 209
 210         $xpath = new DomXPath($doc);
 211
 212         $list = $xpath->query("//meta[@content]");
 213         foreach ($list as $node) {
 214                 $attr = array();
 215                 if ($node->attributes->length)
 216                         foreach ($node->attributes as $attribute)
 217                                 $attr[$attribute->name] = $attribute->value;
 218
 219                 if (@$attr["http-equiv"] == 'refresh') {
 220                         $path = $attr["content"];
 221                         $pathinfo = explode(";", $path);
 222                         $content = "";
 223                         foreach ($pathinfo AS $value) {
 224                                 if (substr(strtolower($value), 0, 4) == "url=")
 225                                         $content = substr($value, 4);
 226                         }
 227                         if ($content != "") {
 228                                 $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count);
 229                                 return($siteinfo);
 230                         }
 231                 }
 232         }
 233
 234         //$list = $xpath->query("head/title");
 235         $list = $xpath->query("//title");
 236         foreach ($list as $node)
 237                 $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
 238
 239         //$list = $xpath->query("head/meta[@name]");
 240         $list = $xpath->query("//meta[@name]");
 241         foreach ($list as $node) {
 242                 $attr = array();
 243                 if ($node->attributes->length)
 244                         foreach ($node->attributes as $attribute)
 245                                 $attr[$attribute->name] = $attribute->value;
 246
 247                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 248
 249                 if ($attr["content"] != "")
 250                         switch (strtolower($attr["name"])) {
 251                                 case "fulltitle":
 252                                         $siteinfo["title"] = $attr["content"];
 253                                         break;
 254                                 case "description":
 255                                         $siteinfo["text"] = $attr["content"];
 256                                         break;
 257                                 case "thumbnail":
 258                                         $siteinfo["image"] = $attr["content"];
 259                                         break;
 260                                 case "twitter:image":
 261                                         $siteinfo["image"] = $attr["content"];
 262                                         break;
 263                                 case "twitter:image:src":
 264                                         $siteinfo["image"] = $attr["content"];
 265                                         break;
 266                                 case "twitter:card":
 267                                         if (($siteinfo["type"] == "") OR ($attr["content"] == "photo"))
 268                                                 $siteinfo["type"] = $attr["content"];
 269                                         break;
 270                                 case "twitter:description":
 271                                         $siteinfo["text"] = $attr["content"];
 272                                         break;
 273                                 case "twitter:title":
 274                                         $siteinfo["title"] = $attr["content"];
 275                                         break;
 276                                 case "dc.title":
 277                                         $siteinfo["title"] = $attr["content"];
 278                                         break;
 279                                 case "dc.description":
 280                                         $siteinfo["text"] = $attr["content"];
 281                                         break;
 282                                 case "keywords":
 283                                         $keywords = explode(",", $attr["content"]);
 284                                         break;
 285                                 case "news_keywords":
 286                                         $keywords = explode(",", $attr["content"]);
 287                                         break;
 288                         }
 289                 if ($siteinfo["type"] == "summary")
 290                         $siteinfo["type"] = "link";
 291         }
 292
 293         if (isset($keywords)) {
 294                 $siteinfo["keywords"] = array();
 295                 foreach ($keywords as $keyword)
 296                         if (!in_array(trim($keyword), $siteinfo["keywords"]))
 297                                 $siteinfo["keywords"][] = trim($keyword);
 298         }
 299
 300         //$list = $xpath->query("head/meta[@property]");
 301         $list = $xpath->query("//meta[@property]");
 302         foreach ($list as $node) {
 303                 $attr = array();
 304                 if ($node->attributes->length)
 305                         foreach ($node->attributes as $attribute)
 306                                 $attr[$attribute->name] = $attribute->value;
 307
 308                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 309
 310                 if ($attr["content"] != "")
 311                         switch (strtolower($attr["property"])) {
 312                                 case "og:image":
 313                                         $siteinfo["image"] = $attr["content"];
 314                                         break;
 315                                 case "og:title":
 316                                         $siteinfo["title"] = $attr["content"];
 317                                         break;
 318                                 case "og:description":
 319                                         $siteinfo["text"] = $attr["content"];
 320                                         break;
 321                         }
 322         }
 323
 324         if ((@$siteinfo["image"] == "") AND !$no_guessing) {
 325             $list = $xpath->query("//img[@src]");
 326             foreach ($list as $node) {
 327                 $attr = array();
 328                 if ($node->attributes->length)
 329                     foreach ($node->attributes as $attribute)
 330                         $attr[$attribute->name] = $attribute->value;
 331
 332                         $src = completeurl($attr["src"], $url);
 333                         $photodata = @getimagesize($src);
 334
 335                         if (($photodata) && ($photodata[0] > 150) and ($photodata[1] > 150)) {
 336                                 if ($photodata[0] > 300) {
 337                                         $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
 338                                         $photodata[0] = 300;
 339                                 }
 340                                 if ($photodata[1] > 300) {
 341                                         $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
 342                                         $photodata[1] = 300;
 343                                 }
 344                                 $siteinfo["images"][] = array("src"=>$src,
 345                                                                 "width"=>$photodata[0],
 346                                                                 "height"=>$photodata[1]);
 347                         }
 348
 349                 }
 350     } else {
 351                 $src = completeurl($siteinfo["image"], $url);
 352
 353                 unset($siteinfo["image"]);
 354
 355                 $photodata = @getimagesize($src);
 356
 357                 if (($photodata) && ($photodata[0] > 10) and ($photodata[1] > 10))
 358                         $siteinfo["images"][] = array("src"=>$src,
 359                                                         "width"=>$photodata[0],
 360                                                         "height"=>$photodata[1]);
 361         }
 362
 363         if ((@$siteinfo["text"] == "") AND (@$siteinfo["title"] != "") AND !$no_guessing) {
 364                 $text = "";
 365
 366                 $list = $xpath->query("//div[@class='article']");
 367                 foreach ($list as $node)
 368                         if (strlen($node->nodeValue) > 40)
 369                                 $text .= " ".trim($node->nodeValue);
 370
 371                 if ($text == "") {
 372                         $list = $xpath->query("//div[@class='content']");
 373                         foreach ($list as $node)
 374                                 if (strlen($node->nodeValue) > 40)
 375                                         $text .= " ".trim($node->nodeValue);
 376                 }
 377
 378                 // If none text was found then take the paragraph content
 379                 if ($text == "") {
 380                         $list = $xpath->query("//p");
 381                         foreach ($list as $node)
 382                                 if (strlen($node->nodeValue) > 40)
 383                                         $text .= " ".trim($node->nodeValue);
 384                 }
 385
 386                 if ($text != "") {
 387                         $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
 388
 389                         while (strpos($text, "  "))
 390                                 $text = trim(str_replace("  ", " ", $text));
 391
 392                         $siteinfo["text"] = trim(html_entity_decode(substr($text,0,350), ENT_QUOTES, "UTF-8").'...');
 393                 }
 394         }
 395
 396         logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
 397
 398         call_hooks('getsiteinfo', $siteinfo);
 399
 400         return($siteinfo);
 401 }
 402
 403 function arr_add_hashes(&$item,$k) {
 404         $item = '#' . $item;
 405 }
 406
 407 function parse_url_content(&$a) {
 408
 409         $text = null;
 410         $str_tags = '';
 411
 412         $textmode = false;
 413
 414         if(local_user() && (! feature_enabled(local_user(),'richtext')))
 415                 $textmode = true;
 416
 417         //if($textmode)
 418         $br = (($textmode) ? "\n" : '<br />');
 419
 420         if(x($_GET,'binurl'))
 421                 $url = trim(hex2bin($_GET['binurl']));
 422         else
 423                 $url = trim($_GET['url']);
 424
 425         if($_GET['title'])
 426                 $title = strip_tags(trim($_GET['title']));
 427
 428         if($_GET['description'])
 429                 $text = strip_tags(trim($_GET['description']));
 430
 431         if($_GET['tags']) {
 432                 $arr_tags = str_getcsv($_GET['tags']);
 433                 if(count($arr_tags)) {
 434                         array_walk($arr_tags,'arr_add_hashes');
 435                         $str_tags = $br . implode(' ',$arr_tags) . $br;
 436                 }
 437         }
 438
 439         // add url scheme if missing
 440         $arrurl = parse_url($url);
 441         if (!x($arrurl, 'scheme')) {
 442                 if (x($arrurl, 'host'))
 443                         $url = "http:".$url;
 444                 else
 445                         $url = "http://".$url;
 446         }
 447
 448         logger('parse_url: ' . $url);
 449
 450         if($textmode)
 451                 $template = '[bookmark=%s]%s[/bookmark]%s';
 452         else
 453                 $template = "<a class=\"bookmark\" href=\"%s\" >%s</a>%s";
 454
 455         $arr = array('url' => $url, 'text' => '');
 456
 457         call_hooks('parse_link', $arr);
 458
 459         if(strlen($arr['text'])) {
 460                 echo $arr['text'];
 461                 killme();
 462         }
 463
 464
 465         if($url && $title && $text) {
 466
 467                 $title = str_replace(array("\r","\n"),array('',''),$title);
 468
 469                 if($textmode)
 470                         $text = '[quote]' . trim($text) . '[/quote]' . $br;
 471                 else {
 472                         $text = '<blockquote>' . htmlspecialchars(trim($text)) . '</blockquote><br />';
 473                         $title = htmlspecialchars($title);
 474                 }
 475
 476                 $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 477
 478                 logger('parse_url (unparsed): returns: ' . $result);
 479
 480                 echo $result;
 481                 killme();
 482         }
 483
 484         $siteinfo = parseurl_getsiteinfo($url);
 485
 486 //      if ($textmode) {
 487 //              require_once("include/items.php");
 488 //
 489 //              echo add_page_info_data($siteinfo);
 490 //              killme();
 491 //      }
 492
 493         $url= $siteinfo["url"];
 494
 495         // If the link contains BBCode stuff, make a short link out of this to avoid parsing problems
 496         if (strpos($url, '[') OR strpos($url, ']')) {
 497                 require_once("include/network.php");
 498                 $url = short_link($url);
 499         }
 500
 501         $sitedata = "";
 502
 503         if($siteinfo["title"] != "") {
 504                 $text = $siteinfo["text"];
 505                 $title = $siteinfo["title"];
 506         }
 507
 508         $image = "";
 509
 510         if (($siteinfo["type"] != "video") AND (sizeof($siteinfo["images"]) > 0)){
 511                 /* Execute below code only if image is present in siteinfo */
 512
 513                 $total_images = 0;
 514                 $max_images = get_config('system','max_bookmark_images');
 515                 if($max_images === false)
 516                         $max_images = 2;
 517                 else
 518                         $max_images = intval($max_images);
 519
 520                 foreach ($siteinfo["images"] as $imagedata) {
 521                         if($textmode)
 522                                 $image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]' . "\n";
 523                         else
 524                                 $image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.$imagedata["src"].'" alt="photo" /><br />';
 525                         $total_images ++;
 526                         if($max_images && $max_images >= $total_images)
 527                                 break;
 528                 }
 529         }
 530
 531         if(strlen($text)) {
 532                 if($textmode)
 533                         $text = '[quote]'.trim($text).'[/quote]';
 534                 else
 535                         $text = '<blockquote>'.htmlspecialchars(trim($text)).'</blockquote>';
 536         }
 537
 538         if($image)
 539                 $text = $br.$br.$image.$text;
 540         else
 541                 $text = $br.$text;
 542
 543         $title = str_replace(array("\r","\n"),array('',''),$title);
 544
 545         $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 546
 547         logger('parse_url: returns: ' . $result);
 548
 549         $sitedata .=  trim($result);
 550
 551         if (($siteinfo["type"] == "video") AND ($url != ""))
 552                 echo "[class=type-video]".$sitedata."[/class]";
 553         elseif (($siteinfo["type"] != "photo"))
 554                 echo "[class=type-link]".$sitedata."[/class]";
 555         else
 556                 echo "[class=type-photo]".$title.$br.$image."[/class]";
 557
 558         killme();
 559 }
 560 ?>