mod/parse_url.php

   1 <?php
   2 /**
   3  * @file mod/parse_url.php
   4  *
   5  * @todo https://developers.google.com/+/plugins/snippet/
   6  *
   7  * @verbatim
   8  * <meta itemprop="name" content="Toller Titel">
   9  * <meta itemprop="description" content="Eine tolle Beschreibung">
  10  * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
  11  *
  12  * <body itemscope itemtype="http://schema.org/Product">
  13  *   <h1 itemprop="name">Shiny Trinket</h1>
  14  *   <img itemprop="image" src="{image-url}" />
  15  *   <p itemprop="description">Shiny trinkets are shiny.</p>
  16  * </body>
  17  * @endverbatim
  18 */
  19
  20 if(!function_exists('deletenode')) {
  21         function deletenode(&$doc, $node)
  22         {
  23                 $xpath = new DomXPath($doc);
  24                 $list = $xpath->query("//".$node);
  25                 foreach ($list as $child)
  26                         $child->parentNode->removeChild($child);
  27         }
  28 }
  29
  30 function completeurl($url, $scheme) {
  31         $urlarr = parse_url($url);
  32
  33         if (isset($urlarr["scheme"]))
  34                 return($url);
  35
  36         $schemearr = parse_url($scheme);
  37
  38         $complete = $schemearr["scheme"]."://".$schemearr["host"];
  39
  40         if (@$schemearr["port"] != "")
  41                 $complete .= ":".$schemearr["port"];
  42
  43                 if(strpos($urlarr['path'],'/') !== 0)
  44                         $complete .= '/';
  45
  46         $complete .= $urlarr["path"];
  47
  48         if (@$urlarr["query"] != "")
  49                 $complete .= "?".$urlarr["query"];
  50
  51         if (@$urlarr["fragment"] != "")
  52                 $complete .= "#".$urlarr["fragment"];
  53
  54         return($complete);
  55 }
  56
  57 function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) {
  58
  59         if ($url == "")
  60                 return false;
  61
  62         $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
  63                 dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed));
  64
  65         if ($r)
  66                 $data = $r[0]["content"];
  67
  68         if (!is_null($data)) {
  69                 $data = unserialize($data);
  70                 return $data;
  71         }
  72
  73         $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed);
  74
  75         q("INSERT INTO `parsed_url` (`url`, `guessing`, `oembed`, `content`, `created`) VALUES ('%s', %d, %d, '%s', '%s')",
  76                 dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed), dbesc(serialize($data)), dbesc(datetime_convert()));
  77
  78         return $data;
  79 }
  80
  81 function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) {
  82         require_once("include/network.php");
  83         require_once("include/Photo.php");
  84
  85         $a = get_app();
  86
  87         $siteinfo = array();
  88
  89         if ($count > 10) {
  90                 logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
  91                 return($siteinfo);
  92         }
  93
  94         $url = trim($url, "'");
  95         $url = trim($url, '"');
  96
  97         $url = original_url($url);
  98
  99         $siteinfo["url"] = $url;
 100         $siteinfo["type"] = "link";
 101
 102         $stamp1 = microtime(true);
 103
 104         $ch = curl_init();
 105         curl_setopt($ch, CURLOPT_URL, $url);
 106         curl_setopt($ch, CURLOPT_HEADER, 1);
 107         curl_setopt($ch, CURLOPT_NOBODY, 1);
 108         curl_setopt($ch, CURLOPT_TIMEOUT, 3);
 109         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 110         //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
 111         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 112
 113         $header = curl_exec($ch);
 114         $curl_info = @curl_getinfo($ch);
 115         $http_code = $curl_info['http_code'];
 116         curl_close($ch);
 117
 118         $a->save_timestamp($stamp1, "network");
 119
 120         if ((($curl_info['http_code'] == "301") OR ($curl_info['http_code'] == "302") OR ($curl_info['http_code'] == "303") OR ($curl_info['http_code'] == "307"))
 121                 AND (($curl_info['redirect_url'] != "") OR ($curl_info['location'] != ""))) {
 122                 if ($curl_info['redirect_url'] != "")
 123                         $siteinfo = parseurl_getsiteinfo($curl_info['redirect_url'], $no_guessing, $do_oembed, ++$count);
 124                 else
 125                         $siteinfo = parseurl_getsiteinfo($curl_info['location'], $no_guessing, $do_oembed, ++$count);
 126                 return($siteinfo);
 127         }
 128
 129         // if the file is too large then exit
 130         if ($curl_info["download_content_length"] > 1000000)
 131                 return($siteinfo);
 132
 133         // if it isn't a HTML file then exit
 134         if (($curl_info["content_type"] != "") AND !strstr(strtolower($curl_info["content_type"]),"html"))
 135                 return($siteinfo);
 136
 137         if ($do_oembed) {
 138                 require_once("include/oembed.php");
 139
 140                 $oembed_data = oembed_fetch_url($url);
 141
 142                 if ($oembed_data->type != "error")
 143                         $siteinfo["type"] = $oembed_data->type;
 144
 145                 if (($oembed_data->type == "link") AND ($siteinfo["type"] != "photo")) {
 146                         if (isset($oembed_data->title))
 147                                 $siteinfo["title"] = $oembed_data->title;
 148                         if (isset($oembed_data->description))
 149                                 $siteinfo["text"] = trim($oembed_data->description);
 150                         if (isset($oembed_data->thumbnail_url))
 151                                 $siteinfo["image"] = $oembed_data->thumbnail_url;
 152                 }
 153         }
 154
 155         $stamp1 = microtime(true);
 156
 157         // Now fetch the body as well
 158         $ch = curl_init();
 159         curl_setopt($ch, CURLOPT_URL, $url);
 160         curl_setopt($ch, CURLOPT_HEADER, 1);
 161         curl_setopt($ch, CURLOPT_NOBODY, 0);
 162         curl_setopt($ch, CURLOPT_TIMEOUT, 10);
 163         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 164         curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
 165
 166         $header = curl_exec($ch);
 167         $curl_info = @curl_getinfo($ch);
 168         $http_code = $curl_info['http_code'];
 169         curl_close($ch);
 170
 171         $a->save_timestamp($stamp1, "network");
 172
 173         // Fetch the first mentioned charset. Can be in body or header
 174         $charset = "";
 175         if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 176                 $charset = trim(trim(trim(array_pop($matches)), ';,'));
 177
 178         if ($charset == "")
 179                 $charset = "utf-8";
 180
 181         $pos = strpos($header, "\r\n\r\n");
 182
 183         if ($pos)
 184                 $body = trim(substr($header, $pos));
 185         else
 186                 $body = $header;
 187
 188         if (($charset != '') AND (strtoupper($charset) != "UTF-8")) {
 189                 logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
 190                 //$body = mb_convert_encoding($body, "UTF-8", $charset);
 191                 $body = iconv($charset, "UTF-8//TRANSLIT", $body);
 192         }
 193
 194         $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
 195
 196         $doc = new DOMDocument();
 197         @$doc->loadHTML($body);
 198
 199         deletenode($doc, 'style');
 200         deletenode($doc, 'script');
 201         deletenode($doc, 'option');
 202         deletenode($doc, 'h1');
 203         deletenode($doc, 'h2');
 204         deletenode($doc, 'h3');
 205         deletenode($doc, 'h4');
 206         deletenode($doc, 'h5');
 207         deletenode($doc, 'h6');
 208         deletenode($doc, 'ol');
 209         deletenode($doc, 'ul');
 210
 211         $xpath = new DomXPath($doc);
 212
 213         $list = $xpath->query("//meta[@content]");
 214         foreach ($list as $node) {
 215                 $attr = array();
 216                 if ($node->attributes->length)
 217                         foreach ($node->attributes as $attribute)
 218                                 $attr[$attribute->name] = $attribute->value;
 219
 220                 if (@$attr["http-equiv"] == 'refresh') {
 221                         $path = $attr["content"];
 222                         $pathinfo = explode(";", $path);
 223                         $content = "";
 224                         foreach ($pathinfo AS $value) {
 225                                 if (substr(strtolower($value), 0, 4) == "url=")
 226                                         $content = substr($value, 4);
 227                         }
 228                         if ($content != "") {
 229                                 $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count);
 230                                 return($siteinfo);
 231                         }
 232                 }
 233         }
 234
 235         //$list = $xpath->query("head/title");
 236         $list = $xpath->query("//title");
 237         foreach ($list as $node)
 238                 $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
 239
 240         //$list = $xpath->query("head/meta[@name]");
 241         $list = $xpath->query("//meta[@name]");
 242         foreach ($list as $node) {
 243                 $attr = array();
 244                 if ($node->attributes->length)
 245                         foreach ($node->attributes as $attribute)
 246                                 $attr[$attribute->name] = $attribute->value;
 247
 248                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 249
 250                 if ($attr["content"] != "")
 251                         switch (strtolower($attr["name"])) {
 252                                 case "fulltitle":
 253                                         $siteinfo["title"] = $attr["content"];
 254                                         break;
 255                                 case "description":
 256                                         $siteinfo["text"] = $attr["content"];
 257                                         break;
 258                                 case "thumbnail":
 259                                         $siteinfo["image"] = $attr["content"];
 260                                         break;
 261                                 case "twitter:image":
 262                                         $siteinfo["image"] = $attr["content"];
 263                                         break;
 264                                 case "twitter:image:src":
 265                                         $siteinfo["image"] = $attr["content"];
 266                                         break;
 267                                 case "twitter:card":
 268                                         if (($siteinfo["type"] == "") OR ($attr["content"] == "photo"))
 269                                                 $siteinfo["type"] = $attr["content"];
 270                                         break;
 271                                 case "twitter:description":
 272                                         $siteinfo["text"] = $attr["content"];
 273                                         break;
 274                                 case "twitter:title":
 275                                         $siteinfo["title"] = $attr["content"];
 276                                         break;
 277                                 case "dc.title":
 278                                         $siteinfo["title"] = $attr["content"];
 279                                         break;
 280                                 case "dc.description":
 281                                         $siteinfo["text"] = $attr["content"];
 282                                         break;
 283                                 case "keywords":
 284                                         $keywords = explode(",", $attr["content"]);
 285                                         break;
 286                                 case "news_keywords":
 287                                         $keywords = explode(",", $attr["content"]);
 288                                         break;
 289                         }
 290                 if ($siteinfo["type"] == "summary")
 291                         $siteinfo["type"] = "link";
 292         }
 293
 294         if (isset($keywords)) {
 295                 $siteinfo["keywords"] = array();
 296                 foreach ($keywords as $keyword)
 297                         if (!in_array(trim($keyword), $siteinfo["keywords"]))
 298                                 $siteinfo["keywords"][] = trim($keyword);
 299         }
 300
 301         //$list = $xpath->query("head/meta[@property]");
 302         $list = $xpath->query("//meta[@property]");
 303         foreach ($list as $node) {
 304                 $attr = array();
 305                 if ($node->attributes->length)
 306                         foreach ($node->attributes as $attribute)
 307                                 $attr[$attribute->name] = $attribute->value;
 308
 309                 $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
 310
 311                 if ($attr["content"] != "")
 312                         switch (strtolower($attr["property"])) {
 313                                 case "og:image":
 314                                         $siteinfo["image"] = $attr["content"];
 315                                         break;
 316                                 case "og:title":
 317                                         $siteinfo["title"] = $attr["content"];
 318                                         break;
 319                                 case "og:description":
 320                                         $siteinfo["text"] = $attr["content"];
 321                                         break;
 322                         }
 323         }
 324
 325         if ((@$siteinfo["image"] == "") AND !$no_guessing) {
 326             $list = $xpath->query("//img[@src]");
 327             foreach ($list as $node) {
 328                 $attr = array();
 329                 if ($node->attributes->length)
 330                     foreach ($node->attributes as $attribute)
 331                         $attr[$attribute->name] = $attribute->value;
 332
 333                         $src = completeurl($attr["src"], $url);
 334                         $photodata = get_photo_info($src);
 335
 336                         if (($photodata) && ($photodata[0] > 150) and ($photodata[1] > 150)) {
 337                                 if ($photodata[0] > 300) {
 338                                         $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
 339                                         $photodata[0] = 300;
 340                                 }
 341                                 if ($photodata[1] > 300) {
 342                                         $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
 343                                         $photodata[1] = 300;
 344                                 }
 345                                 $siteinfo["images"][] = array("src"=>$src,
 346                                                                 "width"=>$photodata[0],
 347                                                                 "height"=>$photodata[1]);
 348                         }
 349
 350                 }
 351     } elseif ($siteinfo["image"] != "") {
 352                 $src = completeurl($siteinfo["image"], $url);
 353
 354                 unset($siteinfo["image"]);
 355
 356                 $photodata = get_photo_info($src);
 357
 358                 if (($photodata) && ($photodata[0] > 10) and ($photodata[1] > 10))
 359                         $siteinfo["images"][] = array("src"=>$src,
 360                                                         "width"=>$photodata[0],
 361                                                         "height"=>$photodata[1]);
 362         }
 363
 364         if ((@$siteinfo["text"] == "") AND (@$siteinfo["title"] != "") AND !$no_guessing) {
 365                 $text = "";
 366
 367                 $list = $xpath->query("//div[@class='article']");
 368                 foreach ($list as $node)
 369                         if (strlen($node->nodeValue) > 40)
 370                                 $text .= " ".trim($node->nodeValue);
 371
 372                 if ($text == "") {
 373                         $list = $xpath->query("//div[@class='content']");
 374                         foreach ($list as $node)
 375                                 if (strlen($node->nodeValue) > 40)
 376                                         $text .= " ".trim($node->nodeValue);
 377                 }
 378
 379                 // If none text was found then take the paragraph content
 380                 if ($text == "") {
 381                         $list = $xpath->query("//p");
 382                         foreach ($list as $node)
 383                                 if (strlen($node->nodeValue) > 40)
 384                                         $text .= " ".trim($node->nodeValue);
 385                 }
 386
 387                 if ($text != "") {
 388                         $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
 389
 390                         while (strpos($text, "  "))
 391                                 $text = trim(str_replace("  ", " ", $text));
 392
 393                         $siteinfo["text"] = trim(html_entity_decode(substr($text,0,350), ENT_QUOTES, "UTF-8").'...');
 394                 }
 395         }
 396
 397         logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
 398
 399         call_hooks('getsiteinfo', $siteinfo);
 400
 401         return($siteinfo);
 402 }
 403
 404 function arr_add_hashes(&$item,$k) {
 405         $item = '#' . $item;
 406 }
 407
 408 function parse_url_content(&$a) {
 409
 410         $text = null;
 411         $str_tags = '';
 412
 413         $textmode = false;
 414
 415         if(local_user() && (! feature_enabled(local_user(),'richtext')))
 416                 $textmode = true;
 417
 418         //if($textmode)
 419         $br = (($textmode) ? "\n" : '<br />');
 420
 421         if(x($_GET,'binurl'))
 422                 $url = trim(hex2bin($_GET['binurl']));
 423         else
 424                 $url = trim($_GET['url']);
 425
 426         if($_GET['title'])
 427                 $title = strip_tags(trim($_GET['title']));
 428
 429         if($_GET['description'])
 430                 $text = strip_tags(trim($_GET['description']));
 431
 432         if($_GET['tags']) {
 433                 $arr_tags = str_getcsv($_GET['tags']);
 434                 if(count($arr_tags)) {
 435                         array_walk($arr_tags,'arr_add_hashes');
 436                         $str_tags = $br . implode(' ',$arr_tags) . $br;
 437                 }
 438         }
 439
 440         // add url scheme if missing
 441         $arrurl = parse_url($url);
 442         if (!x($arrurl, 'scheme')) {
 443                 if (x($arrurl, 'host'))
 444                         $url = "http:".$url;
 445                 else
 446                         $url = "http://".$url;
 447         }
 448
 449         logger('parse_url: ' . $url);
 450
 451         if($textmode)
 452                 $template = '[bookmark=%s]%s[/bookmark]%s';
 453         else
 454                 $template = "<a class=\"bookmark\" href=\"%s\" >%s</a>%s";
 455
 456         $arr = array('url' => $url, 'text' => '');
 457
 458         call_hooks('parse_link', $arr);
 459
 460         if(strlen($arr['text'])) {
 461                 echo $arr['text'];
 462                 killme();
 463         }
 464
 465
 466         if($url && $title && $text) {
 467
 468                 $title = str_replace(array("\r","\n"),array('',''),$title);
 469
 470                 if($textmode)
 471                         $text = '[quote]' . trim($text) . '[/quote]' . $br;
 472                 else {
 473                         $text = '<blockquote>' . htmlspecialchars(trim($text)) . '</blockquote><br />';
 474                         $title = htmlspecialchars($title);
 475                 }
 476
 477                 $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 478
 479                 logger('parse_url (unparsed): returns: ' . $result);
 480
 481                 echo $result;
 482                 killme();
 483         }
 484
 485         $siteinfo = parseurl_getsiteinfo($url);
 486
 487 //      if ($textmode) {
 488 //              require_once("include/items.php");
 489 //
 490 //              echo add_page_info_data($siteinfo);
 491 //              killme();
 492 //      }
 493
 494         $url= $siteinfo["url"];
 495
 496         // If the link contains BBCode stuff, make a short link out of this to avoid parsing problems
 497         if (strpos($url, '[') OR strpos($url, ']')) {
 498                 require_once("include/network.php");
 499                 $url = short_link($url);
 500         }
 501
 502         $sitedata = "";
 503
 504         if($siteinfo["title"] != "") {
 505                 $text = $siteinfo["text"];
 506                 $title = $siteinfo["title"];
 507         }
 508
 509         $image = "";
 510
 511         if (($siteinfo["type"] != "video") AND (sizeof($siteinfo["images"]) > 0)){
 512                 /* Execute below code only if image is present in siteinfo */
 513
 514                 $total_images = 0;
 515                 $max_images = get_config('system','max_bookmark_images');
 516                 if($max_images === false)
 517                         $max_images = 2;
 518                 else
 519                         $max_images = intval($max_images);
 520
 521                 foreach ($siteinfo["images"] as $imagedata) {
 522                         if($textmode)
 523                                 $image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]' . "\n";
 524                         else
 525                                 $image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.$imagedata["src"].'" alt="photo" /><br />';
 526                         $total_images ++;
 527                         if($max_images && $max_images >= $total_images)
 528                                 break;
 529                 }
 530         }
 531
 532         if(strlen($text)) {
 533                 if($textmode)
 534                         $text = '[quote]'.trim($text).'[/quote]';
 535                 else
 536                         $text = '<blockquote>'.htmlspecialchars(trim($text)).'</blockquote>';
 537         }
 538
 539         if($image)
 540                 $text = $br.$br.$image.$text;
 541         else
 542                 $text = $br.$text;
 543
 544         $title = str_replace(array("\r","\n"),array('',''),$title);
 545
 546         $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
 547
 548         logger('parse_url: returns: ' . $result);
 549
 550         $sitedata .=  trim($result);
 551
 552         if (($siteinfo["type"] == "video") AND ($url != ""))
 553                 echo "[class=type-video]".$sitedata."[/class]";
 554         elseif (($siteinfo["type"] != "photo"))
 555                 echo "[class=type-link]".$sitedata."[/class]";
 556         else
 557                 echo "[class=type-photo]".$title.$br.$image."[/class]";
 558
 559         killme();
 560 }
 561 ?>