From: rabuzarus Date: Thu, 24 Nov 2016 00:11:22 +0000 (+0100) Subject: parse_url: recognize image/video/audio files + move functions into own class X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=e9226eaf45e70bcd5a9a9f66b6b922dbc15c47ba;p=friendica.git parse_url: recognize image/video/audio files + move functions into own class --- diff --git a/include/ParseUrl.php b/include/ParseUrl.php new file mode 100644 index 0000000000..8a3392e73b --- /dev/null +++ b/include/ParseUrl.php @@ -0,0 +1,482 @@ + 10) { + logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); + return($siteinfo); + } + + $url = trim($url, "'"); + $url = trim($url, '"'); + + $url = original_url($url); + + $siteinfo["url"] = $url; + $siteinfo["type"] = "link"; + + $check_cert = Config::get("system", "verifyssl"); + + $stamp1 = microtime(true); + + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 1); + curl_setopt($ch, CURLOPT_NOBODY, 1); + curl_setopt($ch, CURLOPT_TIMEOUT, 3); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); + + $header = curl_exec($ch); + $curl_info = @curl_getinfo($ch); + $http_code = $curl_info["http_code"]; + curl_close($ch); + + $a->save_timestamp($stamp1, "network"); + + if ((($curl_info["http_code"] == "301") || ($curl_info["http_code"] == "302") || ($curl_info["http_code"] == "303") || ($curl_info["http_code"] == "307")) + && (($curl_info["redirect_url"] != "") || ($curl_info["location"] != ""))) { + if ($curl_info["redirect_url"] != "") { + $siteinfo = self::getSiteinfo($curl_info["redirect_url"], $no_guessing, $do_oembed, ++$count); + } else { + $siteinfo = self::getSiteinfo($curl_info["location"], $no_guessing, $do_oembed, ++$count); + } + return($siteinfo); + } + + // If the file is too large then exit + if ($curl_info["download_content_length"] > 1000000) { + return($siteinfo); + } + + // If it isn't a HTML file then exit + if (($curl_info["content_type"] != "") && !strstr(strtolower($curl_info["content_type"]), "html")) { + return($siteinfo); + } + + if ($do_oembed) { + + $oembed_data = oembed_fetch_url($url); + + if (!in_array($oembed_data->type, array("error", "rich"))) { + $siteinfo["type"] = $oembed_data->type; + } + + if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) { + if (isset($oembed_data->title)) { + $siteinfo["title"] = $oembed_data->title; + } + if (isset($oembed_data->description)) { + $siteinfo["text"] = trim($oembed_data->description); + } + if (isset($oembed_data->thumbnail_url)) { + $siteinfo["image"] = $oembed_data->thumbnail_url; + } + } + } + + $stamp1 = microtime(true); + + // Now fetch the body as well + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 1); + curl_setopt($ch, CURLOPT_NOBODY, 0); + curl_setopt($ch, CURLOPT_TIMEOUT, 10); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); + curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); + + $header = curl_exec($ch); + $curl_info = @curl_getinfo($ch); + $http_code = $curl_info["http_code"]; + curl_close($ch); + + $a->save_timestamp($stamp1, "network"); + + // Fetch the first mentioned charset. Can be in body or header + $charset = ""; + if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { + $charset = trim(trim(trim(array_pop($matches)), ';,')); + } + + if ($charset == "") { + $charset = "utf-8"; + } + + $pos = strpos($header, "\r\n\r\n"); + + if ($pos) { + $body = trim(substr($header, $pos)); + } else { + $body = $header; + } + + if (($charset != "") && (strtoupper($charset) != "UTF-8")) { + logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); + //$body = mb_convert_encoding($body, "UTF-8", $charset); + $body = iconv($charset, "UTF-8//TRANSLIT", $body); + } + + $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); + + $doc = new \DOMDocument(); + @$doc->loadHTML($body); + + self::deleteNode($doc, "style"); + self::deleteNode($doc, "script"); + self::deleteNode($doc, "option"); + self::deleteNode($doc, "h1"); + self::deleteNode($doc, "h2"); + self::deleteNode($doc, "h3"); + self::deleteNode($doc, "h4"); + self::deleteNode($doc, "h5"); + self::deleteNode($doc, "h6"); + self::deleteNode($doc, "ol"); + self::deleteNode($doc, "ul"); + + $xpath = new \DomXPath($doc); + + $list = $xpath->query("//meta[@content]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + if (@$attr["http-equiv"] == "refresh") { + $path = $attr["content"]; + $pathinfo = explode(";", $path); + $content = ""; + foreach ($pathinfo as $value) { + if (substr(strtolower($value), 0, 4) == "url=") { + $content = substr($value, 4); + } + } + if ($content != "") { + $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count); + return($siteinfo); + } + } + } + + $list = $xpath->query("//title"); + if ($list->length > 0) { + $siteinfo["title"] = $list->item(0)->nodeValue; + } + + //$list = $xpath->query("head/meta[@name]"); + $list = $xpath->query("//meta[@name]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["name"])) { + case "fulltitle": + $siteinfo["title"] = $attr["content"]; + break; + case "description": + $siteinfo["text"] = $attr["content"]; + break; + case "thumbnail": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image:src": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:card": + if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) { + $siteinfo["type"] = $attr["content"]; + } + break; + case "twitter:description": + $siteinfo["text"] = $attr["content"]; + break; + case "twitter:title": + $siteinfo["title"] = $attr["content"]; + break; + case "dc.title": + $siteinfo["title"] = $attr["content"]; + break; + case "dc.description": + $siteinfo["text"] = $attr["content"]; + break; + case "keywords": + $keywords = explode(",", $attr["content"]); + break; + case "news_keywords": + $keywords = explode(",", $attr["content"]); + break; + } + } + if ($siteinfo["type"] == "summary") { + $siteinfo["type"] = "link"; + } + } + + if (isset($keywords)) { + $siteinfo["keywords"] = array(); + foreach ($keywords as $keyword) { + if (!in_array(trim($keyword), $siteinfo["keywords"])) { + $siteinfo["keywords"][] = trim($keyword); + } + } + } + + //$list = $xpath->query("head/meta[@property]"); + $list = $xpath->query("//meta[@property]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["property"])) { + case "og:image": + $siteinfo["image"] = $attr["content"]; + break; + case "og:title": + $siteinfo["title"] = $attr["content"]; + break; + case "og:description": + $siteinfo["text"] = $attr["content"]; + break; + } + } + } + + if ((@$siteinfo["image"] == "") && !$no_guessing) { + $list = $xpath->query("//img[@src]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $src = self::completeUrl($attr["src"], $url); + $photodata = get_photo_info($src); + + if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) { + if ($photodata[0] > 300) { + $photodata[1] = round($photodata[1] * (300 / $photodata[0])); + $photodata[0] = 300; + } + if ($photodata[1] > 300) { + $photodata[0] = round($photodata[0] * (300 / $photodata[1])); + $photodata[1] = 300; + } + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + + } + } elseif ($siteinfo["image"] != "") { + $src = self::completeUrl($siteinfo["image"], $url); + + unset($siteinfo["image"]); + + $photodata = get_photo_info($src); + + if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) { + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + } + + if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) { + $text = ""; + + $list = $xpath->query("//div[@class='article']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + + if ($text == "") { + $list = $xpath->query("//div[@class='content']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + // If none text was found then take the paragraph content + if ($text == "") { + $list = $xpath->query("//p"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + if ($text != "") { + $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); + + while (strpos($text, " ")) { + $text = trim(str_replace(" ", " ", $text)); + } + + $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...'); + } + } + + logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); + + call_hooks("getsiteinfo", $siteinfo); + + return($siteinfo); + } + + /** + * @brief Convert tags from CSV to an array + * + * @param string $string Tags + * @return array with formatted Hashtags + */ + public static function convertTagsToArray($string) { + $arr_tags = str_getcsv($string); + if (count($arr_tags)) { + // add the # sign to every tag + array_walk($arr_tags, array("self", "arrAddHashes")); + + return $arr_tags; + } + } + + /** + * @brief Add a hasht sign to a string + * + * This method is used as callback function + * + * @param string $tag The pure tag name + * @param int $k Counter for internal use + */ + private static function arrAddHashes(&$tag, $k) { + $tag = "#" . $tag; + } + + private static function deleteNode(&$doc, $node) { + $xpath = new \DomXPath($doc); + $list = $xpath->query("//".$node); + foreach ($list as $child) { + $child->parentNode->removeChild($child); + } + } + + private static function completeUrl($url, $scheme) { + $urlarr = parse_url($url); + + if (isset($urlarr["scheme"])) { + return($url); + } + + $schemearr = parse_url($scheme); + + $complete = $schemearr["scheme"]."://".$schemearr["host"]; + + if (@$schemearr["port"] != "") { + $complete .= ":".$schemearr["port"]; + } + + if (strpos($urlarr["path"],"/") !== 0) { + $complete .= "/"; + } + + $complete .= $urlarr["path"]; + + if (@$urlarr["query"] != "") { + $complete .= "?".$urlarr["query"]; + } + + if (@$urlarr["fragment"] != "") { + $complete .= "#".$urlarr["fragment"]; + } + + return($complete); + } +} diff --git a/include/items.php b/include/items.php index 9b199aed3b..e9354b62d3 100644 --- a/include/items.php +++ b/include/items.php @@ -1,5 +1,11 @@ type == "error") OR ($no_rich_type AND ($j->type == "rich"))) { - require_once("mod/parse_url.php"); - $data = parseurl_getsiteinfo_cached($embedurl, true, false); + $data = ParseUrl::getSiteinfoCached($embedurl, true, false); $j->type = $data["type"]; if ($j->type == "photo") { @@ -143,12 +151,11 @@ function oembed_fetch_url($embedurl, $no_rich_type = false){ function oembed_format_object($j){ require_once("mod/proxy.php"); - $a = get_app(); $embedurl = $j->embedurl; $jhtml = oembed_iframe($j->embedurl,(isset($j->width) ? $j->width : null), (isset($j->height) ? $j->height : null) ); $ret=""; switch ($j->type) { - case "video": { + case "video": if (isset($j->thumbnail_url)) { $tw = (isset($j->thumbnail_width) && intval($j->thumbnail_width)) ? $j->thumbnail_width:200; $th = (isset($j->thumbnail_height) && intval($j->thumbnail_height)) ? $j->thumbnail_height:180; @@ -158,7 +165,7 @@ function oembed_format_object($j){ $th=120; $tw = $th*$tr; $tpl=get_markup_template('oembed_video.tpl'); $ret.=replace_macros($tpl, array( - '$baseurl' => $a->get_baseurl(), + '$baseurl' => App::get_baseurl(), '$embedurl'=>$embedurl, '$escapedhtml'=>base64_encode($jhtml), '$tw'=>$tw, @@ -170,43 +177,49 @@ function oembed_format_object($j){ $ret=$jhtml; } //$ret.="
"; - }; break; - case "photo": { + break; + case "photo": $ret.= ""; - }; break; - case "link": { - }; break; - case "rich": { + break; + case "link": + break; + case "rich": // not so safe.. - if (!get_config("system","no_oembed_rich_content")) + if (!Config::get("system","no_oembed_rich_content")) { $ret.= proxy_parse_html($jhtml); - }; break; + } + break; } // add link to source if not present in "rich" type if ($j->type!='rich' || !strpos($j->html,$embedurl) ){ $ret .= "

"; if (isset($j->title)) { - if (isset($j->provider_name)) + if (isset($j->provider_name)) { $ret .= $j->provider_name.": "; + } $embedlink = (isset($j->title))?$j->title:$embedurl; $ret .= "$embedlink"; - if (isset($j->author_name)) + if (isset($j->author_name)) { $ret.=" (".$j->author_name.")"; + } } elseif (isset($j->provider_name) OR isset($j->author_name)) { $embedlink = ""; - if (isset($j->provider_name)) + if (isset($j->provider_name)) { $embedlink .= $j->provider_name; + } if (isset($j->author_name)) { - if ($embedlink != "") + if ($embedlink != "") { $embedlink .= ": "; + } $embedlink .= $j->author_name; } - if (trim($embedlink) == "") + if (trim($embedlink) == "") { $embedlink = $embedurl; + } $ret .= "$embedlink"; } @@ -247,15 +260,14 @@ function oembed_iframe($src, $width, $height) { } $width = '100%'; - $a = get_app(); - $s = $a->get_baseurl() . '/oembed/'.base64url_encode($src); + $s = App::get_baseurl() . '/oembed/'.base64url_encode($src); return ''; } function oembed_bbcode2html($text){ - $stopoembed = get_config("system","no_oembed"); + $stopoembed = Config::get("system","no_oembed"); if ($stopoembed == true){ return preg_replace("/\[embed\](.+?)\[\/embed\]/is", "". t('Embedding disabled') ." : $1" ,$text); } @@ -268,13 +280,13 @@ function oe_build_xpath($attr, $value){ return "contains( normalize-space( @$attr ), ' $value ' ) or substring( normalize-space( @$attr ), 1, string-length( '$value' ) + 1 ) = '$value ' or substring( normalize-space( @$attr ), string-length( @$attr ) - string-length( '$value' ) ) = ' $value' or @$attr = '$value'"; } -function oe_get_inner_html( $node ) { - $innerHTML= ''; - $children = $node->childNodes; - foreach ($children as $child) { - $innerHTML .= $child->ownerDocument->saveXML( $child ); - } - return $innerHTML; +function oe_get_inner_html($node) { + $innerHTML= ''; + $children = $node->childNodes; + foreach ($children as $child) { + $innerHTML .= $child->ownerDocument->saveXML($child); + } + return $innerHTML; } /** @@ -283,15 +295,16 @@ function oe_get_inner_html( $node ) { */ function oembed_html2bbcode($text) { // start parser only if 'oembed' is in text - if (strpos($text, "oembed")){ + if (strpos($text, "oembed")) { // convert non ascii chars to html entities $html_text = mb_convert_encoding($text, 'HTML-ENTITIES', mb_detect_encoding($text)); // If it doesn't parse at all, just return the text. $dom = @DOMDocument::loadHTML($html_text); - if(! $dom) + if (! $dom) { return $text; + } $xpath = new DOMXPath($dom); $attr = "oembed"; diff --git a/include/plaintext.php b/include/plaintext.php index 539ef020df..d98d736550 100644 --- a/include/plaintext.php +++ b/include/plaintext.php @@ -1,6 +1,15 @@ query("//".$node); - foreach ($list as $child) - $child->parentNode->removeChild($child); - } -} - -function completeurl($url, $scheme) { - $urlarr = parse_url($url); - - if (isset($urlarr["scheme"])) - return($url); - - $schemearr = parse_url($scheme); - - $complete = $schemearr["scheme"]."://".$schemearr["host"]; - - if (@$schemearr["port"] != "") - $complete .= ":".$schemearr["port"]; - - if(strpos($urlarr['path'],'/') !== 0) - $complete .= '/'; +use \Friendica\ParseUrl; - $complete .= $urlarr["path"]; +require_once("include/items.php"); - if (@$urlarr["query"] != "") - $complete .= "?".$urlarr["query"]; - - if (@$urlarr["fragment"] != "") - $complete .= "#".$urlarr["fragment"]; - - return($complete); -} - -function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) { - - if ($url == "") - return false; +function parse_url_content(&$a) { - $r = q("SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d", - dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed)); + $text = null; + $str_tags = ""; - if ($r) - $data = $r[0]["content"]; + $textmode = false; - if (!is_null($data)) { - $data = unserialize($data); - return $data; + if (local_user() && (!feature_enabled(local_user(), "richtext"))) { + $textmode = true; } - $data = parseurl_getsiteinfo($url, $no_guessing, $do_oembed); - - q("INSERT INTO `parsed_url` (`url`, `guessing`, `oembed`, `content`, `created`) VALUES ('%s', %d, %d, '%s', '%s') - ON DUPLICATE KEY UPDATE `content` = '%s', `created` = '%s'", - dbesc(normalise_link($url)), intval(!$no_guessing), intval($do_oembed), - dbesc(serialize($data)), dbesc(datetime_convert()), - dbesc(serialize($data)), dbesc(datetime_convert())); - - return $data; -} - -function parseurl_getsiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { - require_once("include/network.php"); - require_once("include/Photo.php"); - - $a = get_app(); - - $siteinfo = array(); + $br = (($textmode) ? "\n" : "
"); - // Check if the URL does contain a scheme - $scheme = parse_url($url, PHP_URL_SCHEME); - - if ($scheme == "") { - $url = "http://".trim($url, "/"); + if (x($_GET,"binurl")) { + $url = trim(hex2bin($_GET["binurl"])); + } else { + $url = trim($_GET["url"]); } - if ($count > 10) { - logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); - return($siteinfo); + if ($_GET["title"]) { + $title = strip_tags(trim($_GET["title"])); } - $url = trim($url, "'"); - $url = trim($url, '"'); - - $url = original_url($url); - - $siteinfo["url"] = $url; - $siteinfo["type"] = "link"; - - $check_cert = get_config('system','verifyssl'); - - $stamp1 = microtime(true); - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 1); - curl_setopt($ch, CURLOPT_TIMEOUT, 3); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info['http_code']; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - - if ((($curl_info['http_code'] == "301") OR ($curl_info['http_code'] == "302") OR ($curl_info['http_code'] == "303") OR ($curl_info['http_code'] == "307")) - AND (($curl_info['redirect_url'] != "") OR ($curl_info['location'] != ""))) { - if ($curl_info['redirect_url'] != "") - $siteinfo = parseurl_getsiteinfo($curl_info['redirect_url'], $no_guessing, $do_oembed, ++$count); - else - $siteinfo = parseurl_getsiteinfo($curl_info['location'], $no_guessing, $do_oembed, ++$count); - return($siteinfo); + if ($_GET["description"]) { + $text = strip_tags(trim($_GET["description"])); } - // if the file is too large then exit - if ($curl_info["download_content_length"] > 1000000) - return($siteinfo); - - // if it isn't a HTML file then exit - if (($curl_info["content_type"] != "") AND !strstr(strtolower($curl_info["content_type"]),"html")) - return($siteinfo); - - if ($do_oembed) { - require_once("include/oembed.php"); - - $oembed_data = oembed_fetch_url($url); - - if (!in_array($oembed_data->type, array("error", "rich"))) { - $siteinfo["type"] = $oembed_data->type; - } - - if (($oembed_data->type == "link") AND ($siteinfo["type"] != "photo")) { - if (isset($oembed_data->title)) - $siteinfo["title"] = $oembed_data->title; - if (isset($oembed_data->description)) - $siteinfo["text"] = trim($oembed_data->description); - if (isset($oembed_data->thumbnail_url)) - $siteinfo["image"] = $oembed_data->thumbnail_url; + if ($_GET["tags"]) { + $arr_tags = ParseUrl::convertTagsToArray($_GET["tags"]); + if (count($arr_tags)) { + $str_tags = $br . implode(" ", $arr_tags) . $br; } } - $stamp1 = microtime(true); - - // Now fetch the body as well - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 0); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent()); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (($check_cert) ? true : false)); - curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, (($check_cert) ? 2 : false)); - - $header = curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info['http_code']; - curl_close($ch); - - $a->save_timestamp($stamp1, "network"); - - // Fetch the first mentioned charset. Can be in body or header - $charset = ""; - if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) - $charset = trim(trim(trim(array_pop($matches)), ';,')); - - if ($charset == "") - $charset = "utf-8"; - - $pos = strpos($header, "\r\n\r\n"); - - if ($pos) - $body = trim(substr($header, $pos)); - else - $body = $header; - - if (($charset != '') AND (strtoupper($charset) != "UTF-8")) { - logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); - //$body = mb_convert_encoding($body, "UTF-8", $charset); - $body = iconv($charset, "UTF-8//TRANSLIT", $body); - } - - $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); - - $doc = new DOMDocument(); - @$doc->loadHTML($body); - - deletenode($doc, 'style'); - deletenode($doc, 'script'); - deletenode($doc, 'option'); - deletenode($doc, 'h1'); - deletenode($doc, 'h2'); - deletenode($doc, 'h3'); - deletenode($doc, 'h4'); - deletenode($doc, 'h5'); - deletenode($doc, 'h6'); - deletenode($doc, 'ol'); - deletenode($doc, 'ul'); - - $xpath = new DomXPath($doc); - - $list = $xpath->query("//meta[@content]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - if (@$attr["http-equiv"] == 'refresh') { - $path = $attr["content"]; - $pathinfo = explode(";", $path); - $content = ""; - foreach ($pathinfo AS $value) { - if (substr(strtolower($value), 0, 4) == "url=") - $content = substr($value, 4); - } - if ($content != "") { - $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count); - return($siteinfo); - } + // Add url scheme if it is missing + $arrurl = parse_url($url); + if (!x($arrurl, "scheme")) { + if (x($arrurl, "host")) { + $url = "http:".$url; + } else { + $url = "http://".$url; } } - $list = $xpath->query("//title"); - if ($list->length > 0) - $siteinfo["title"] = $list->item(0)->nodeValue; - - //$list = $xpath->query("head/meta[@name]"); - $list = $xpath->query("//meta[@name]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") - switch (strtolower($attr["name"])) { - case "fulltitle": - $siteinfo["title"] = $attr["content"]; - break; - case "description": - $siteinfo["text"] = $attr["content"]; - break; - case "thumbnail": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image:src": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:card": - if (($siteinfo["type"] == "") OR ($attr["content"] == "photo")) - $siteinfo["type"] = $attr["content"]; - break; - case "twitter:description": - $siteinfo["text"] = $attr["content"]; - break; - case "twitter:title": - $siteinfo["title"] = $attr["content"]; - break; - case "dc.title": - $siteinfo["title"] = $attr["content"]; - break; - case "dc.description": - $siteinfo["text"] = $attr["content"]; - break; - case "keywords": - $keywords = explode(",", $attr["content"]); - break; - case "news_keywords": - $keywords = explode(",", $attr["content"]); - break; + logger("prse_url: " . $url); + + // If the URL is a image, video or audio file format the URL with the corresponding + // BBCode media tag + $redirects = 0; + // Fetch the header of the URL + $result = z_fetch_url($url, false, $redirects, array("novalidate" => true, "nobody" => true)); + if($result["success"]) { + // Convert the header fields into an array + $hdrs = array(); + $h = explode("\n", $result["header"]); + foreach ($h as $l) { + list($k,$v) = array_map("trim", explode(":", trim($l), 2)); + $hdrs[$k] = $v; + } + if (array_key_exists("Content-Type", $hdrs)) { + $type = $hdrs["Content-Type"]; + } + if ($type) { + if(stripos($type, "image/") !== false) { + echo $br . "[img]" . $url . "[/img]" . $br; + killme(); } - if ($siteinfo["type"] == "summary") - $siteinfo["type"] = "link"; - } - - if (isset($keywords)) { - $siteinfo["keywords"] = array(); - foreach ($keywords as $keyword) - if (!in_array(trim($keyword), $siteinfo["keywords"])) - $siteinfo["keywords"][] = trim($keyword); - } - - //$list = $xpath->query("head/meta[@property]"); - $list = $xpath->query("//meta[@property]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") - switch (strtolower($attr["property"])) { - case "og:image": - $siteinfo["image"] = $attr["content"]; - break; - case "og:title": - $siteinfo["title"] = $attr["content"]; - break; - case "og:description": - $siteinfo["text"] = $attr["content"]; - break; + if (stripos($type, "video/") !== false) { + echo $br . "[video]" . $url . "[/video]" . $br; + killme(); } - } - - if ((@$siteinfo["image"] == "") AND !$no_guessing) { - $list = $xpath->query("//img[@src]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) - foreach ($node->attributes as $attribute) - $attr[$attribute->name] = $attribute->value; - - $src = completeurl($attr["src"], $url); - $photodata = get_photo_info($src); - - if (($photodata) && ($photodata[0] > 150) and ($photodata[1] > 150)) { - if ($photodata[0] > 300) { - $photodata[1] = round($photodata[1] * (300 / $photodata[0])); - $photodata[0] = 300; - } - if ($photodata[1] > 300) { - $photodata[0] = round($photodata[0] * (300 / $photodata[1])); - $photodata[1] = 300; - } - $siteinfo["images"][] = array("src"=>$src, - "width"=>$photodata[0], - "height"=>$photodata[1]); + if (stripos($type, "audio/") !== false) { + echo $br . "[audio]" . $url . "[/audio]" . $br; + killme(); } - } - } elseif ($siteinfo["image"] != "") { - $src = completeurl($siteinfo["image"], $url); - - unset($siteinfo["image"]); - - $photodata = get_photo_info($src); - - if (($photodata) && ($photodata[0] > 10) and ($photodata[1] > 10)) - $siteinfo["images"][] = array("src"=>$src, - "width"=>$photodata[0], - "height"=>$photodata[1]); } - if ((@$siteinfo["text"] == "") AND (@$siteinfo["title"] != "") AND !$no_guessing) { - $text = ""; - - $list = $xpath->query("//div[@class='article']"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - - if ($text == "") { - $list = $xpath->query("//div[@class='content']"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - } - - // If none text was found then take the paragraph content - if ($text == "") { - $list = $xpath->query("//p"); - foreach ($list as $node) - if (strlen($node->nodeValue) > 40) - $text .= " ".trim($node->nodeValue); - } - - if ($text != "") { - $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); - - while (strpos($text, " ")) - $text = trim(str_replace(" ", " ", $text)); - - $siteinfo["text"] = trim(html_entity_decode(substr($text,0,350), ENT_QUOTES, "UTF-8").'...'); - } - } - - logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); - - call_hooks('getsiteinfo', $siteinfo); - - return($siteinfo); -} - -function arr_add_hashes(&$item,$k) { - $item = '#' . $item; -} - -function parse_url_content(&$a) { - - require_once("include/items.php"); - - $text = null; - $str_tags = ''; - - $textmode = false; - - if(local_user() && (! feature_enabled(local_user(),'richtext'))) - $textmode = true; - - //if($textmode) - $br = (($textmode) ? "\n" : '
'); - - if(x($_GET,'binurl')) - $url = trim(hex2bin($_GET['binurl'])); - else - $url = trim($_GET['url']); - - if($_GET['title']) - $title = strip_tags(trim($_GET['title'])); - - if($_GET['description']) - $text = strip_tags(trim($_GET['description'])); - - if($_GET['tags']) { - $arr_tags = str_getcsv($_GET['tags']); - if(count($arr_tags)) { - array_walk($arr_tags,'arr_add_hashes'); - $str_tags = $br . implode(' ',$arr_tags) . $br; - } - } - - // add url scheme if missing - $arrurl = parse_url($url); - if (!x($arrurl, 'scheme')) { - if (x($arrurl, 'host')) - $url = "http:".$url; - else - $url = "http://".$url; - } - - logger('parse_url: ' . $url); - - if($textmode) - $template = '[bookmark=%s]%s[/bookmark]%s'; - else + if ($textmode) { + $template = "[bookmark=%s]%s[/bookmark]%s"; + } else { $template = "%s%s"; + } - $arr = array('url' => $url, 'text' => ''); + $arr = array("url" => $url, "text" => ""); - call_hooks('parse_link', $arr); + call_hooks("parse_link", $arr); - if(strlen($arr['text'])) { - echo $arr['text']; + if (strlen($arr["text"])) { + echo $arr["text"]; killme(); } - if($url && $title && $text) { + if ($url && $title && $text) { - $title = str_replace(array("\r","\n"),array('',''),$title); + $title = str_replace(array("\r","\n"),array("",""),$title); - if($textmode) - $text = '[quote]' . trim($text) . '[/quote]' . $br; - else { - $text = '
' . htmlspecialchars(trim($text)) . '

'; + if ($textmode) { + $text = "[quote]" . trim($text) . "[/quote]" . $br; + } else { + $text = "
" . htmlspecialchars(trim($text)) . "

"; $title = htmlspecialchars($title); } - $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags; + $result = sprintf($template, $url, ($title) ? $title : $url, $text) . $str_tags; - logger('parse_url (unparsed): returns: ' . $result); + logger("parse_url (unparsed): returns: " . $result); echo $result; killme(); } - $siteinfo = parseurl_getsiteinfo($url); + // Fetch the information from the webpage + $siteinfo = ParseUrl::getSiteinfo($url); unset($siteinfo["keywords"]); + // Format it as BBCode attachment $info = add_page_info_data($siteinfo); - if (!$textmode) + if (!$textmode) { // Replace ' with ’ - not perfect - but the richtext editor has problems otherwise $info = str_replace(array("'"), array("’"), $info); + } echo $info; killme(); } -?>