mod/parse_url.php

   1 <?php
   2 /**
   3  * @file mod/parse_url.php
   4  *
   5  * @todo https://developers.google.com/+/plugins/snippet/
   6  *
   7  * @verbatim
   8  * <meta itemprop="name" content="Toller Titel">
   9  * <meta itemprop="description" content="Eine tolle Beschreibung">
  10  * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
  11  *
  12  * <body itemscope itemtype="http://schema.org/Product">
  13  *   <h1 itemprop="name">Shiny Trinket</h1>
  14  *   <img itemprop="image" src="{image-url}" />
  15  *   <p itemprop="description">Shiny trinkets are shiny.</p>
  16  * </body>
  17  * @endverbatim
  18 */
  19
  20 if(!function_exists('deletenode')) {
  21         function deletenode(&$doc, $node)
  22         {
  23                 $xpath = new DomXPath($doc);
  24                 $list = $xpath->query("//".$node);
  25                 foreach ($list as $child)
  26                         $child->parentNode->removeChild($child);
  27         }
  28 }
  29
  30 function completeurl($url, $scheme) {
  31         $urlarr = parse_url($url);
  32
  33         if (isset($urlarr["scheme"]))
  34                 return($url);
  35
  36         $schemearr = parse_url($scheme);
  37
  38         $complete = $schemearr["scheme"]."://".$schemearr["host"];
  39
  40         if (@$schemearr["port"] != "")
  41                 $complete .= ":".$schemearr["port"];
  42
  43                 if(strpos($urlarr['path'],'/') !== 0)
  44                         $complete .= '/';
  45
  46         $complete .= $urlarr["path"];
  47
  48         if (@$urlarr["query"] != "")
  49                 $complete .= "?".$urlarr["query"];
  50
  51         if (@$urlarr["fragment"] != "")
  52                 $complete .= "#".$urlarr["fragment"];
  53
  54         return($complete);
  55 }
  56
  57 function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) {
  58
  59         $data = Cache::get("parse_url:".$no_guessing.":".$do_oembed.":".$url);
  60         if (!is_null($data)) {
  61                 $data = unserialize($data);
  62                 return $data;
  63         }
  64
  65         $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed);
  66
  67         Cache::set("parse_url:".$no_guessing.":".$do_oembed.":".$url,serialize($data), CACHE_DAY);
  68
  69         return $data;
  70 }
  71
  72 function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
  73         require_once("include/network.php");
  74         require_once("include/Photo.php");
  75
  76         $a = get_app();
  77
  78         $siteinfo = array();
  79
  80         if ($count > 10) {
  81                 logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
  82                 return($siteinfo);
  83         }
  84
  85         $url = trim($url, "'");
  86         $url = trim($url, '"');
  87
  88         $url = original_url($url);
  89
  90         $siteinfo["url"] = $url;
  91         $siteinfo["type"] = "link";
  92
  93         $stamp1 = microtime(true);
  94
  95         $ch = curl_init();
  96         curl_setopt($ch, CURLOPT_URL, $url);
  97         curl_setopt($ch, CURLOPT_HEADER, 1);
  98         curl_setopt($ch, CURLOPT_NOBODY, 1);
  99         curl_setopt($ch, CURLOPT_TIMEOUT, 3);
 100         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 101         //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
 102         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 103
 104         $header = curl_exec($ch);
 105         $curl_info = @curl_getinfo($ch);
 106         $http_code = $curl_info['http_code'];
 107         curl_close($ch);
 108
 109         $a->save_timestamp($stamp1, "network");
 110
 111         if ((($curl_info['http_code'] == "301") OR ($curl_info['http_code'] == "302") OR ($curl_info['http_code'] == "303") OR ($curl_info['http_code'] == "307"))
 112                 AND (($curl_info['redirect_url'] != "") OR ($curl_info['location'] != ""))) {
 113                 if ($curl_info['redirect_url'] != "")
 114                         $siteinfo = parseurl_getsiteinfo($curl_info['redirect_url'], $no_guessing, $do_oembed, ++$count);
 115                 else
 116                         $siteinfo = parseurl_getsiteinfo($curl_info['location'], $no_guessing, $do_oembed, ++$count);
 117                 return($siteinfo);
 118         }
 119
 120         // if the file is too large then exit
 121         if ($curl_info["download_content_length"] > 1000000)
 122                 return($siteinfo);
 123
 124         // if it isn't a HTML file then exit
 125         if (($curl_info["content_type"] != "") AND !strstr(strtolower($curl_info["content_type"]),"html"))
 126                 return($siteinfo);
 127
 128         if ($do_oembed) {
 129                 require_once("include/oembed.php");
 130
 131                 $oembed_data = oembed_fetch_url($url);
 132
 133                 if ($oembed_data->type != "error")
 134                         $siteinfo["type"] = $oembed_data->type;
 135
 136                 if (($oembed_data->type == "link") AND ($siteinfo["type"] != "photo")) {
 137                         if (isset($oembed_data->title))
 138                                 $siteinfo["title"] = $oembed_data->title;
 139                         if (isset($oembed_data->description))
 140                                 $siteinfo["text"] = trim($oembed_data->description);
 141                         if (isset($oembed_data->thumbnail_url))
 142                                 $siteinfo["image"] = $oembed_data->thumbnail_url;
 143                 }
 144         }
 145
 146         $stamp1 = microtime(true);
 147
 148         // Now fetch the body as well
 149         $ch = curl_init();
 150         curl_setopt($ch, CURLOPT_URL, $url);
 151         curl_setopt($ch, CURLOPT_HEADER, 1);
 152         curl_setopt($ch, CURLOPT_NOBODY, 0);
 153         curl_setopt($ch, CURLOPT_TIMEOUT, 10);
 154         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 155         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 156
 157         $header = curl_exec($ch);
 158         $curl_info = @curl_getinfo($ch);
 159         $http_code = $curl_info['http_code'];
 160         curl_close($ch);
 161
 162         $a->save_timestamp($stamp1, "network");
 163
 164         // Fetch the first mentioned charset. Can be in body or header
 165         $charset = "";
 166         if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 167                 $charset = trim(trim(trim(array_pop($matches)), ';,'));
 168
 169         if ($charset == "")
 170                 $charset = "utf-8";
 171
 172         $pos = strpos($header, "\r\n\r\n");
 173
 174         if ($pos)
 175                 $body = trim(substr($header, $pos));
 176         else
 177                 $body = $header;
 178
 179         if (($charset != '') AND (strtoupper($charset) != "UTF-8")) {
 180                 logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
 181                 //$body = mb_convert_encoding($body, "UTF-8", $charset);
 182                 $body = iconv($charset, "UTF-8//TRANSLIT", $body);
 183         }
 184
 185         $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
 186
 187         $doc = new DOMDocument();
 188         @$doc->loadHTML($body);
 189
 190         deletenode($doc, 'style');
 191         deletenode($doc, 'script');
 192         deletenode($doc, 'option');
 193         deletenode($doc, 'h1');
 194         deletenode($doc, 'h2');
 195         deletenode($doc, 'h3');
 196         deletenode($doc, 'h4');
 197         deletenode($doc, 'h5');
 198         deletenode($doc, 'h6');
 199         deletenode($doc, 'ol');
 200         deletenode($doc, 'ul');
 201
 202         $xpath = new DomXPath($doc);
 203
 204         $list = $xpath->query("//meta[@content]");
 205         foreach ($list as $node) {
 206                 $attr = array();
 207                 if ($node->attributes->length)
 208                         foreach ($node->attributes as $attribute)
 209                                 $attr[$attribute->name] = $attribute->value;
 210
 211                 if (@$attr["http-equiv"] == 'refresh') {
 212                         $path = $attr["content"];
 213                         $pathinfo = explode(";", $path);
 214                         $content = "";
 215                         foreach ($pathinfo AS $value) {
 216                                 if (substr(strtolower($value), 0, 4) == "url=")
 217                                         $content = substr($value, 4);
 218                         }
 219                         if ($content != "") {
 220                                 $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count);
 221                                 return($siteinfo);
 222                         }
 223                 }
 224         }
 225
 226         //$list = $xpath->query("head/title");
 227         $list = $xpath->query("//title");
 228         foreach ($list as $node)
 229                 $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
 230
 231         //$list = $xpath->query("head/meta[@name]");
 232         $list = $xpath->query("//meta[@name]");
 233         foreach ($list as $node) {
 234                 $attr = array();
 235                 if ($node->attributes->length)
 236                         foreach ($node->attributes as $attribute)
 237                                 $attr[$attribute->name] = $attribute->value;
 238
 239                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 240
 241                 if ($attr["content"] != "")
 242                         switch (strtolower($attr["name"])) {
 243                                 case "fulltitle":
 244                                         $siteinfo["title"] = $attr["content"];
 245                                         break;
 246                                 case "description":
 247                                         $siteinfo["text"] = $attr["content"];
 248                                         break;
 249                                 case "thumbnail":
 250                                         $siteinfo["image"] = $attr["content"];
 251                                         break;
 252                                 case "twitter:image":
 253                                         $siteinfo["image"] = $attr["content"];
 254                                         break;
 255                                 case "twitter:image:src":
 256                                         $siteinfo["image"] = $attr["content"];
 257                                         break;
 258                                 case "twitter:card":
 259                                         if (($siteinfo["type"] == "") OR ($attr["content"] == "photo"))
 260                                                 $siteinfo["type"] = $attr["content"];
 261                                         break;
 262                                 case "twitter:description":
 263                                         $siteinfo["text"] = $attr["content"];
 264                                         break;
 265                                 case "twitter:title":
 266                                         $siteinfo["title"] = $attr["content"];
 267                                         break;
 268                                 case "dc.title":
 269                                         $siteinfo["title"] = $attr["content"];
 270                                         break;
 271                                 case "dc.description":
 272                                         $siteinfo["text"] = $attr["content"];
 273                                         break;
 274                                 case "keywords":
 275                                         $keywords = explode(",", $attr["content"]);
 276                                         break;
 277                                 case "news_keywords":
 278                                         $keywords = explode(",", $attr["content"]);
 279                                         break;
 280                         }
 281                 if ($siteinfo["type"] == "summary")
 282                         $siteinfo["type"] = "link";
 283         }
 284
 285         if (isset($keywords)) {
 286                 $siteinfo["keywords"] = array();
 287                 foreach ($keywords as $keyword)
 288                         if (!in_array(trim($keyword), $siteinfo["keywords"]))
 289                                 $siteinfo["keywords"][] = trim($keyword);
 290         }
 291
 292         //$list = $xpath->query("head/meta[@property]");
 293         $list = $xpath->query("//meta[@property]");
 294         foreach ($list as $node) {
 295                 $attr = array();
 296                 if ($node->attributes->length)
 297                         foreach ($node->attributes as $attribute)
 298                                 $attr[$attribute->name] = $attribute->value;
 299
 300                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 301
 302                 if ($attr["content"] != "")
 303                         switch (strtolower($attr["property"])) {
 304                                 case "og:image":
 305                                         $siteinfo["image"] = $attr["content"];
 306                                         break;
 307                                 case "og:title":
 308                                         $siteinfo["title"] = $attr["content"];
 309                                         break;
 310                                 case "og:description":
 311                                         $siteinfo["text"] = $attr["content"];
 312                                         break;
 313                         }
 314         }
 315
 316         if ((@$siteinfo["image"] == "") AND !$no_guessing) {
 317             $list = $xpath->query("//img[@src]");
 318             foreach ($list as $node) {
 319                 $attr = array();
 320                 if ($node->attributes->length)
 321                     foreach ($node->attributes as $attribute)
 322                         $attr[$attribute->name] = $attribute->value;
 323
 324                         $src = completeurl($attr["src"], $url);
 325                         $photodata = get_photo_info($src);
 326
 327                         if (($photodata) && ($photodata[0] > 150) and ($photodata[1] > 150)) {
 328                                 if ($photodata[0] > 300) {
 329                                         $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
 330                                         $photodata[0] = 300;
 331                                 }
 332                                 if ($photodata[1] > 300) {
 333                                         $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
 334                                         $photodata[1] = 300;
 335                                 }
 336                                 $siteinfo["images"][] = array("src"=>$src,
 337                                                                 "width"=>$photodata[0],
 338                                                                 "height"=>$photodata[1]);
 339                         }
 340
 341                 }
 342     } elseif ($siteinfo["image"] != "") {
 343                 $src = completeurl($siteinfo["image"], $url);
 344
 345                 unset($siteinfo["image"]);
 346
 347                 $photodata = get_photo_info($src);
 348
 349                 if (($photodata) && ($photodata[0] > 10) and ($photodata[1] > 10))
 350                         $siteinfo["images"][] = array("src"=>$src,
 351                                                         "width"=>$photodata[0],
 352                                                         "height"=>$photodata[1]);
 353         }
 354
 355         if ((@$siteinfo["text"] == "") AND (@$siteinfo["title"] != "") AND !$no_guessing) {
 356                 $text = "";
 357
 358                 $list = $xpath->query("//div[@class='article']");
 359                 foreach ($list as $node)
 360                         if (strlen($node->nodeValue) > 40)
 361                                 $text .= " ".trim($node->nodeValue);
 362
 363                 if ($text == "") {
 364                         $list = $xpath->query("//div[@class='content']");
 365                         foreach ($list as $node)
 366                                 if (strlen($node->nodeValue) > 40)
 367                                         $text .= " ".trim($node->nodeValue);
 368                 }
 369
 370                 // If none text was found then take the paragraph content
 371                 if ($text == "") {
 372                         $list = $xpath->query("//p");
 373                         foreach ($list as $node)
 374                                 if (strlen($node->nodeValue) > 40)
 375                                         $text .= " ".trim($node->nodeValue);
 376                 }
 377
 378                 if ($text != "") {
 379                         $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
 380
 381                         while (strpos($text, "  "))
 382                                 $text = trim(str_replace("  ", " ", $text));
 383
 384                         $siteinfo["text"] = trim(html_entity_decode(substr($text,0,350), ENT_QUOTES, "UTF-8").'...');
 385                 }
 386         }
 387
 388         logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
 389
 390         call_hooks('getsiteinfo', $siteinfo);
 391
 392         return($siteinfo);
 393 }
 394
 395 function arr_add_hashes(&$item,$k) {
 396         $item = '#' . $item;
 397 }
 398
 399 function parse_url_content(&$a) {
 400
 401         $text = null;
 402         $str_tags = '';
 403
 404         $textmode = false;
 405
 406         if(local_user() && (! feature_enabled(local_user(),'richtext')))
 407                 $textmode = true;
 408
 409         //if($textmode)
 410         $br = (($textmode) ? "\n" : '<br />');
 411
 412         if(x($_GET,'binurl'))
 413                 $url = trim(hex2bin($_GET['binurl']));
 414         else
 415                 $url = trim($_GET['url']);
 416
 417         if($_GET['title'])
 418                 $title = strip_tags(trim($_GET['title']));
 419
 420         if($_GET['description'])
 421                 $text = strip_tags(trim($_GET['description']));
 422
 423         if($_GET['tags']) {
 424                 $arr_tags = str_getcsv($_GET['tags']);
 425                 if(count($arr_tags)) {
 426                         array_walk($arr_tags,'arr_add_hashes');
 427                         $str_tags = $br . implode(' ',$arr_tags) . $br;
 428                 }
 429         }
 430
 431         // add url scheme if missing
 432         $arrurl = parse_url($url);
 433         if (!x($arrurl, 'scheme')) {
 434                 if (x($arrurl, 'host'))
 435                         $url = "http:".$url;
 436                 else
 437                         $url = "http://".$url;
 438         }
 439
 440         logger('parse_url: ' . $url);
 441
 442         if($textmode)
 443                 $template = '[bookmark=%s]%s[/bookmark]%s';
 444         else
 445                 $template = "<a class=\"bookmark\" href=\"%s\" >%s</a>%s";
 446
 447         $arr = array('url' => $url, 'text' => '');
 448
 449         call_hooks('parse_link', $arr);
 450
 451         if(strlen($arr['text'])) {
 452                 echo $arr['text'];
 453                 killme();
 454         }
 455
 456
 457         if($url && $title && $text) {
 458
 459                 $title = str_replace(array("\r","\n"),array('',''),$title);
 460
 461                 if($textmode)
 462                         $text = '[quote]' . trim($text) . '[/quote]' . $br;
 463                 else {
 464                         $text = '<blockquote>' . htmlspecialchars(trim($text)) . '</blockquote><br />';
 465                         $title = htmlspecialchars($title);
 466                 }
 467
 468                 $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 469
 470                 logger('parse_url (unparsed): returns: ' . $result);
 471
 472                 echo $result;
 473                 killme();
 474         }
 475
 476         $siteinfo = parseurl_getsiteinfo($url);
 477
 478 //      if ($textmode) {
 479 //              require_once("include/items.php");
 480 //
 481 //              echo add_page_info_data($siteinfo);
 482 //              killme();
 483 //      }
 484
 485         $url= $siteinfo["url"];
 486
 487         // If the link contains BBCode stuff, make a short link out of this to avoid parsing problems
 488         if (strpos($url, '[') OR strpos($url, ']')) {
 489                 require_once("include/network.php");
 490                 $url = short_link($url);
 491         }
 492
 493         $sitedata = "";
 494
 495         if($siteinfo["title"] != "") {
 496                 $text = $siteinfo["text"];
 497                 $title = $siteinfo["title"];
 498         }
 499
 500         $image = "";
 501
 502         if (($siteinfo["type"] != "video") AND (sizeof($siteinfo["images"]) > 0)){
 503                 /* Execute below code only if image is present in siteinfo */
 504
 505                 $total_images = 0;
 506                 $max_images = get_config('system','max_bookmark_images');
 507                 if($max_images === false)
 508                         $max_images = 2;
 509                 else
 510                         $max_images = intval($max_images);
 511
 512                 foreach ($siteinfo["images"] as $imagedata) {
 513                         if($textmode)
 514                                 $image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]' . "\n";
 515                         else
 516                                 $image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.$imagedata["src"].'" alt="photo" /><br />';
 517                         $total_images ++;
 518                         if($max_images && $max_images >= $total_images)
 519                                 break;
 520                 }
 521         }
 522
 523         if(strlen($text)) {
 524                 if($textmode)
 525                         $text = '[quote]'.trim($text).'[/quote]';
 526                 else
 527                         $text = '<blockquote>'.htmlspecialchars(trim($text)).'</blockquote>';
 528         }
 529
 530         if($image)
 531                 $text = $br.$br.$image.$text;
 532         else
 533                 $text = $br.$text;
 534
 535         $title = str_replace(array("\r","\n"),array('',''),$title);
 536
 537         $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 538
 539         logger('parse_url: returns: ' . $result);
 540
 541         $sitedata .=  trim($result);
 542
 543         if (($siteinfo["type"] == "video") AND ($url != ""))
 544                 echo "[class=type-video]".$sitedata."[/class]";
 545         elseif (($siteinfo["type"] != "photo"))
 546                 echo "[class=type-link]".$sitedata."[/class]";
 547         else
 548                 echo "[class=type-photo]".$title.$br.$image."[/class]";
 549
 550         killme();
 551 }
 552 ?>