From: Hypolite Petovan Date: Thu, 4 Jan 2018 17:03:15 +0000 (-0500) Subject: Move ParseUrl to Util namespace X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=1f805da94fce43067cfd97997706d39e965ab035;p=friendica.git Move ParseUrl to Util namespace --- diff --git a/include/items.php b/include/items.php index 8f15f94799..4bb00cdddd 100644 --- a/include/items.php +++ b/include/items.php @@ -3,7 +3,6 @@ * @file include/items.php */ use Friendica\App; -use Friendica\ParseUrl; use Friendica\Content\Feature; use Friendica\Core\Config; use Friendica\Core\PConfig; @@ -18,6 +17,7 @@ use Friendica\Object\Image; use Friendica\Protocol\DFRN; use Friendica\Protocol\OStatus; use Friendica\Protocol\Feed; +use Friendica\Util\ParseUrl; require_once 'include/bbcode.php'; require_once 'include/tags.php'; diff --git a/include/plaintext.php b/include/plaintext.php index 39dcc9ecdb..5931cba573 100644 --- a/include/plaintext.php +++ b/include/plaintext.php @@ -3,9 +3,9 @@ * @file include/plaintext.php */ use Friendica\App; -use Friendica\ParseUrl; use Friendica\Core\PConfig; use Friendica\Object\Image; +use Friendica\Util\ParseUrl; require_once "include/bbcode.php"; require_once "include/html2plain.php"; diff --git a/mod/parse_url.php b/mod/parse_url.php index 4fe9256349..4fabba6c7e 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -11,7 +11,7 @@ */ use Friendica\App; -use Friendica\ParseUrl; +use Friendica\Util\ParseUrl; require_once("include/items.php"); diff --git a/src/Content/OEmbed.php b/src/Content/OEmbed.php index 30493e1b8b..51c987755e 100644 --- a/src/Content/OEmbed.php +++ b/src/Content/OEmbed.php @@ -10,7 +10,7 @@ use Friendica\Core\Cache; use Friendica\Core\System; use Friendica\Core\Config; use Friendica\Database\DBM; -use Friendica\ParseUrl; +use Friendica\Util\ParseUrl; use dba; use DOMDocument; use DOMXPath; diff --git a/src/ParseUrl.php b/src/ParseUrl.php deleted file mode 100644 index 0c67589ddf..0000000000 --- a/src/ParseUrl.php +++ /dev/null @@ -1,512 +0,0 @@ - The url of the parsed page - * string 'type' => Content type - * string 'title' => The title of the content - * string 'text' => The description for the content - * string 'image' => A preview image of the content (only available - * if $no_geuessing = false - * array'images' = Array of preview pictures - * string 'keywords' => The tags which belong to the content - * - * @see ParseUrl::getSiteinfo() for more information about scraping - * embeddable content - */ - public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) - { - if ($url == "") { - return false; - } - - $r = q( - "SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d", - dbesc(normalise_link($url)), - intval(!$no_guessing), - intval($do_oembed) - ); - - if ($r) { - $data = $r[0]["content"]; - } - - if (!is_null($data)) { - $data = unserialize($data); - return $data; - } - - $data = self::getSiteinfo($url, $no_guessing, $do_oembed); - - dba::insert( - 'parsed_url', - array( - 'url' => normalise_link($url), 'guessing' => !$no_guessing, - 'oembed' => $do_oembed, 'content' => serialize($data), - 'created' => datetime_convert()), - true - ); - - return $data; - } - /** - * @brief Parse a page for embeddable content information - * - * This method parses to url for meta data which can be used to embed - * the content. If available it prioritizes Open Graph meta tags. - * If this is not available it uses the twitter cards meta tags. - * As fallback it uses standard html elements with meta informations - * like \Awesome Title\ or - * \ - * - * @param string $url The url of the page which should be scraped - * @param bool $no_guessing If true the parse doens't search for - * preview pictures - * @param bool $do_oembed The false option is used by the function fetch_oembed() - * to avoid endless loops - * @param int $count Internal counter to avoid endless loops - * - * @return array which contains needed data for embedding - * string 'url' => The url of the parsed page - * string 'type' => Content type - * string 'title' => The title of the content - * string 'text' => The description for the content - * string 'image' => A preview image of the content (only available - * if $no_geuessing = false - * array'images' = Array of preview pictures - * string 'keywords' => The tags which belong to the content - * - * @todo https://developers.google.com/+/plugins/snippet/ - * @verbatim - * - * - * - * - * - *

Shiny Trinket

- * - *

Shiny trinkets are shiny.

- * - * @endverbatim - */ - public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) - { - $a = get_app(); - - $siteinfo = array(); - - // Check if the URL does contain a scheme - $scheme = parse_url($url, PHP_URL_SCHEME); - - if ($scheme == "") { - $url = "http://".trim($url, "/"); - } - - if ($count > 10) { - logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); - return($siteinfo); - } - - $url = trim($url, "'"); - $url = trim($url, '"'); - - $url = strip_tracking_query_params($url); - - $siteinfo["url"] = $url; - $siteinfo["type"] = "link"; - - $data = z_fetch_url($url); - if (!$data['success']) { - return($siteinfo); - } - - // If the file is too large then exit - if ($data["info"]["download_content_length"] > 1000000) { - return($siteinfo); - } - - // If it isn't a HTML file then exit - if (($data["info"]["content_type"] != "") && !strstr(strtolower($data["info"]["content_type"]), "html")) { - return($siteinfo); - } - - $header = $data["header"]; - $body = $data["body"]; - - if ($do_oembed) { - $oembed_data = OEmbed::fetchURL($url); - - if (!in_array($oembed_data->type, array("error", "rich", ""))) { - $siteinfo["type"] = $oembed_data->type; - } - - if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) { - if (isset($oembed_data->title)) { - $siteinfo["title"] = trim($oembed_data->title); - } - if (isset($oembed_data->description)) { - $siteinfo["text"] = trim($oembed_data->description); - } - if (isset($oembed_data->thumbnail_url)) { - $siteinfo["image"] = $oembed_data->thumbnail_url; - } - } - } - - // Fetch the first mentioned charset. Can be in body or header - $charset = ""; - if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { - $charset = trim(trim(trim(array_pop($matches)), ';,')); - } - - if ($charset == "") { - $charset = "utf-8"; - } - - if (($charset != "") && (strtoupper($charset) != "UTF-8")) { - logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); - //$body = mb_convert_encoding($body, "UTF-8", $charset); - $body = iconv($charset, "UTF-8//TRANSLIT", $body); - } - - $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); - - $doc = new DOMDocument(); - @$doc->loadHTML($body); - - XML::deleteNode($doc, "style"); - XML::deleteNode($doc, "script"); - XML::deleteNode($doc, "option"); - XML::deleteNode($doc, "h1"); - XML::deleteNode($doc, "h2"); - XML::deleteNode($doc, "h3"); - XML::deleteNode($doc, "h4"); - XML::deleteNode($doc, "h5"); - XML::deleteNode($doc, "h6"); - XML::deleteNode($doc, "ol"); - XML::deleteNode($doc, "ul"); - - $xpath = new DOMXPath($doc); - - $list = $xpath->query("//meta[@content]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $attr[$attribute->name] = $attribute->value; - } - } - - if (@$attr["http-equiv"] == "refresh") { - $path = $attr["content"]; - $pathinfo = explode(";", $path); - $content = ""; - foreach ($pathinfo as $value) { - if (substr(strtolower($value), 0, 4) == "url=") { - $content = substr($value, 4); - } - } - if ($content != "") { - $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count); - return($siteinfo); - } - } - } - - $list = $xpath->query("//title"); - if ($list->length > 0) { - $siteinfo["title"] = trim($list->item(0)->nodeValue); - } - - //$list = $xpath->query("head/meta[@name]"); - $list = $xpath->query("//meta[@name]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $attr[$attribute->name] = $attribute->value; - } - } - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") { - switch (strtolower($attr["name"])) { - case "fulltitle": - $siteinfo["title"] = trim($attr["content"]); - break; - case "description": - $siteinfo["text"] = trim($attr["content"]); - break; - case "thumbnail": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:image:src": - $siteinfo["image"] = $attr["content"]; - break; - case "twitter:card": - if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) { - $siteinfo["type"] = $attr["content"]; - } - break; - case "twitter:description": - $siteinfo["text"] = trim($attr["content"]); - break; - case "twitter:title": - $siteinfo["title"] = trim($attr["content"]); - break; - case "dc.title": - $siteinfo["title"] = trim($attr["content"]); - break; - case "dc.description": - $siteinfo["text"] = trim($attr["content"]); - break; - case "keywords": - $keywords = explode(",", $attr["content"]); - break; - case "news_keywords": - $keywords = explode(",", $attr["content"]); - break; - } - } - if ($siteinfo["type"] == "summary") { - $siteinfo["type"] = "link"; - } - } - - if (isset($keywords)) { - $siteinfo["keywords"] = array(); - foreach ($keywords as $keyword) { - if (!in_array(trim($keyword), $siteinfo["keywords"])) { - $siteinfo["keywords"][] = trim($keyword); - } - } - } - - //$list = $xpath->query("head/meta[@property]"); - $list = $xpath->query("//meta[@property]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $attr[$attribute->name] = $attribute->value; - } - } - - $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); - - if ($attr["content"] != "") { - switch (strtolower($attr["property"])) { - case "og:image": - $siteinfo["image"] = $attr["content"]; - break; - case "og:title": - $siteinfo["title"] = trim($attr["content"]); - break; - case "og:description": - $siteinfo["text"] = trim($attr["content"]); - break; - } - } - } - - if ((@$siteinfo["image"] == "") && !$no_guessing) { - $list = $xpath->query("//img[@src]"); - foreach ($list as $node) { - $attr = array(); - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $attr[$attribute->name] = $attribute->value; - } - } - - $src = self::completeUrl($attr["src"], $url); - $photodata = Image::getInfoFromURL($src); - - if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) { - if ($photodata[0] > 300) { - $photodata[1] = round($photodata[1] * (300 / $photodata[0])); - $photodata[0] = 300; - } - if ($photodata[1] > 300) { - $photodata[0] = round($photodata[0] * (300 / $photodata[1])); - $photodata[1] = 300; - } - $siteinfo["images"][] = array("src" => $src, - "width" => $photodata[0], - "height" => $photodata[1]); - } - } - } elseif ($siteinfo["image"] != "") { - $src = self::completeUrl($siteinfo["image"], $url); - - unset($siteinfo["image"]); - - $photodata = Image::getInfoFromURL($src); - - if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) { - $siteinfo["images"][] = array("src" => $src, - "width" => $photodata[0], - "height" => $photodata[1]); - } - } - - if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) { - $text = ""; - - $list = $xpath->query("//div[@class='article']"); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= " ".trim($node->nodeValue); - } - } - - if ($text == "") { - $list = $xpath->query("//div[@class='content']"); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= " ".trim($node->nodeValue); - } - } - } - - // If none text was found then take the paragraph content - if ($text == "") { - $list = $xpath->query("//p"); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= " ".trim($node->nodeValue); - } - } - } - - if ($text != "") { - $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); - - while (strpos($text, " ")) { - $text = trim(str_replace(" ", " ", $text)); - } - - $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...'); - } - } - - logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); - - call_hooks("getsiteinfo", $siteinfo); - - return($siteinfo); - } - - /** - * @brief Convert tags from CSV to an array - * - * @param string $string Tags - * @return array with formatted Hashtags - */ - public static function convertTagsToArray($string) - { - $arr_tags = str_getcsv($string); - if (count($arr_tags)) { - // add the # sign to every tag - array_walk($arr_tags, array("self", "arrAddHashes")); - - return $arr_tags; - } - } - - /** - * @brief Add a hasht sign to a string - * - * This method is used as callback function - * - * @param string $tag The pure tag name - * @param int $k Counter for internal use - * @return void - */ - private static function arrAddHashes(&$tag, $k) - { - $tag = "#" . $tag; - } - - /** - * @brief Add a scheme to an url - * - * The src attribute of some html elements (e.g. images) - * can miss the scheme so we need to add the correct - * scheme - * - * @param string $url The url which possibly does have - * a missing scheme (a link to an image) - * @param string $scheme The url with a correct scheme - * (e.g. the url from the webpage which does contain the image) - * - * @return string The url with a scheme - */ - private static function completeUrl($url, $scheme) - { - $urlarr = parse_url($url); - - // If the url does allready have an scheme - // we can stop the process here - if (isset($urlarr["scheme"])) { - return($url); - } - - $schemearr = parse_url($scheme); - - $complete = $schemearr["scheme"]."://".$schemearr["host"]; - - if (@$schemearr["port"] != "") { - $complete .= ":".$schemearr["port"]; - } - - if (strpos($urlarr["path"], "/") !== 0) { - $complete .= "/"; - } - - $complete .= $urlarr["path"]; - - if (@$urlarr["query"] != "") { - $complete .= "?".$urlarr["query"]; - } - - if (@$urlarr["fragment"] != "") { - $complete .= "#".$urlarr["fragment"]; - } - - return($complete); - } -} diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php new file mode 100644 index 0000000000..7154e0f4af --- /dev/null +++ b/src/Util/ParseUrl.php @@ -0,0 +1,512 @@ + The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_geuessing = false + * array'images' = Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @see ParseUrl::getSiteinfo() for more information about scraping + * embeddable content + */ + public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) + { + if ($url == "") { + return false; + } + + $r = q( + "SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d", + dbesc(normalise_link($url)), + intval(!$no_guessing), + intval($do_oembed) + ); + + if ($r) { + $data = $r[0]["content"]; + } + + if (!is_null($data)) { + $data = unserialize($data); + return $data; + } + + $data = self::getSiteinfo($url, $no_guessing, $do_oembed); + + dba::insert( + 'parsed_url', + array( + 'url' => normalise_link($url), 'guessing' => !$no_guessing, + 'oembed' => $do_oembed, 'content' => serialize($data), + 'created' => datetime_convert()), + true + ); + + return $data; + } + /** + * @brief Parse a page for embeddable content information + * + * This method parses to url for meta data which can be used to embed + * the content. If available it prioritizes Open Graph meta tags. + * If this is not available it uses the twitter cards meta tags. + * As fallback it uses standard html elements with meta informations + * like \Awesome Title\ or + * \ + * + * @param string $url The url of the page which should be scraped + * @param bool $no_guessing If true the parse doens't search for + * preview pictures + * @param bool $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * @param int $count Internal counter to avoid endless loops + * + * @return array which contains needed data for embedding + * string 'url' => The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_geuessing = false + * array'images' = Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @todo https://developers.google.com/+/plugins/snippet/ + * @verbatim + * + * + * + * + * + *

Shiny Trinket

+ * + *

Shiny trinkets are shiny.

+ * + * @endverbatim + */ + public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) + { + $a = get_app(); + + $siteinfo = array(); + + // Check if the URL does contain a scheme + $scheme = parse_url($url, PHP_URL_SCHEME); + + if ($scheme == "") { + $url = "http://".trim($url, "/"); + } + + if ($count > 10) { + logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG); + return($siteinfo); + } + + $url = trim($url, "'"); + $url = trim($url, '"'); + + $url = strip_tracking_query_params($url); + + $siteinfo["url"] = $url; + $siteinfo["type"] = "link"; + + $data = z_fetch_url($url); + if (!$data['success']) { + return($siteinfo); + } + + // If the file is too large then exit + if ($data["info"]["download_content_length"] > 1000000) { + return($siteinfo); + } + + // If it isn't a HTML file then exit + if (($data["info"]["content_type"] != "") && !strstr(strtolower($data["info"]["content_type"]), "html")) { + return($siteinfo); + } + + $header = $data["header"]; + $body = $data["body"]; + + if ($do_oembed) { + $oembed_data = OEmbed::fetchURL($url); + + if (!in_array($oembed_data->type, array("error", "rich", ""))) { + $siteinfo["type"] = $oembed_data->type; + } + + if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) { + if (isset($oembed_data->title)) { + $siteinfo["title"] = trim($oembed_data->title); + } + if (isset($oembed_data->description)) { + $siteinfo["text"] = trim($oembed_data->description); + } + if (isset($oembed_data->thumbnail_url)) { + $siteinfo["image"] = $oembed_data->thumbnail_url; + } + } + } + + // Fetch the first mentioned charset. Can be in body or header + $charset = ""; + if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) { + $charset = trim(trim(trim(array_pop($matches)), ';,')); + } + + if ($charset == "") { + $charset = "utf-8"; + } + + if (($charset != "") && (strtoupper($charset) != "UTF-8")) { + logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG); + //$body = mb_convert_encoding($body, "UTF-8", $charset); + $body = iconv($charset, "UTF-8//TRANSLIT", $body); + } + + $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8"); + + $doc = new DOMDocument(); + @$doc->loadHTML($body); + + XML::deleteNode($doc, "style"); + XML::deleteNode($doc, "script"); + XML::deleteNode($doc, "option"); + XML::deleteNode($doc, "h1"); + XML::deleteNode($doc, "h2"); + XML::deleteNode($doc, "h3"); + XML::deleteNode($doc, "h4"); + XML::deleteNode($doc, "h5"); + XML::deleteNode($doc, "h6"); + XML::deleteNode($doc, "ol"); + XML::deleteNode($doc, "ul"); + + $xpath = new DOMXPath($doc); + + $list = $xpath->query("//meta[@content]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + if (@$attr["http-equiv"] == "refresh") { + $path = $attr["content"]; + $pathinfo = explode(";", $path); + $content = ""; + foreach ($pathinfo as $value) { + if (substr(strtolower($value), 0, 4) == "url=") { + $content = substr($value, 4); + } + } + if ($content != "") { + $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count); + return($siteinfo); + } + } + } + + $list = $xpath->query("//title"); + if ($list->length > 0) { + $siteinfo["title"] = trim($list->item(0)->nodeValue); + } + + //$list = $xpath->query("head/meta[@name]"); + $list = $xpath->query("//meta[@name]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["name"])) { + case "fulltitle": + $siteinfo["title"] = trim($attr["content"]); + break; + case "description": + $siteinfo["text"] = trim($attr["content"]); + break; + case "thumbnail": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:image:src": + $siteinfo["image"] = $attr["content"]; + break; + case "twitter:card": + if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) { + $siteinfo["type"] = $attr["content"]; + } + break; + case "twitter:description": + $siteinfo["text"] = trim($attr["content"]); + break; + case "twitter:title": + $siteinfo["title"] = trim($attr["content"]); + break; + case "dc.title": + $siteinfo["title"] = trim($attr["content"]); + break; + case "dc.description": + $siteinfo["text"] = trim($attr["content"]); + break; + case "keywords": + $keywords = explode(",", $attr["content"]); + break; + case "news_keywords": + $keywords = explode(",", $attr["content"]); + break; + } + } + if ($siteinfo["type"] == "summary") { + $siteinfo["type"] = "link"; + } + } + + if (isset($keywords)) { + $siteinfo["keywords"] = array(); + foreach ($keywords as $keyword) { + if (!in_array(trim($keyword), $siteinfo["keywords"])) { + $siteinfo["keywords"][] = trim($keyword); + } + } + } + + //$list = $xpath->query("head/meta[@property]"); + $list = $xpath->query("//meta[@property]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8")); + + if ($attr["content"] != "") { + switch (strtolower($attr["property"])) { + case "og:image": + $siteinfo["image"] = $attr["content"]; + break; + case "og:title": + $siteinfo["title"] = trim($attr["content"]); + break; + case "og:description": + $siteinfo["text"] = trim($attr["content"]); + break; + } + } + } + + if ((@$siteinfo["image"] == "") && !$no_guessing) { + $list = $xpath->query("//img[@src]"); + foreach ($list as $node) { + $attr = array(); + if ($node->attributes->length) { + foreach ($node->attributes as $attribute) { + $attr[$attribute->name] = $attribute->value; + } + } + + $src = self::completeUrl($attr["src"], $url); + $photodata = Image::getInfoFromURL($src); + + if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) { + if ($photodata[0] > 300) { + $photodata[1] = round($photodata[1] * (300 / $photodata[0])); + $photodata[0] = 300; + } + if ($photodata[1] > 300) { + $photodata[0] = round($photodata[0] * (300 / $photodata[1])); + $photodata[1] = 300; + } + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + } + } elseif ($siteinfo["image"] != "") { + $src = self::completeUrl($siteinfo["image"], $url); + + unset($siteinfo["image"]); + + $photodata = Image::getInfoFromURL($src); + + if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) { + $siteinfo["images"][] = array("src" => $src, + "width" => $photodata[0], + "height" => $photodata[1]); + } + } + + if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) { + $text = ""; + + $list = $xpath->query("//div[@class='article']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + + if ($text == "") { + $list = $xpath->query("//div[@class='content']"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + // If none text was found then take the paragraph content + if ($text == "") { + $list = $xpath->query("//p"); + foreach ($list as $node) { + if (strlen($node->nodeValue) > 40) { + $text .= " ".trim($node->nodeValue); + } + } + } + + if ($text != "") { + $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text)); + + while (strpos($text, " ")) { + $text = trim(str_replace(" ", " ", $text)); + } + + $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...'); + } + } + + logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG); + + call_hooks("getsiteinfo", $siteinfo); + + return($siteinfo); + } + + /** + * @brief Convert tags from CSV to an array + * + * @param string $string Tags + * @return array with formatted Hashtags + */ + public static function convertTagsToArray($string) + { + $arr_tags = str_getcsv($string); + if (count($arr_tags)) { + // add the # sign to every tag + array_walk($arr_tags, array("self", "arrAddHashes")); + + return $arr_tags; + } + } + + /** + * @brief Add a hasht sign to a string + * + * This method is used as callback function + * + * @param string $tag The pure tag name + * @param int $k Counter for internal use + * @return void + */ + private static function arrAddHashes(&$tag, $k) + { + $tag = "#" . $tag; + } + + /** + * @brief Add a scheme to an url + * + * The src attribute of some html elements (e.g. images) + * can miss the scheme so we need to add the correct + * scheme + * + * @param string $url The url which possibly does have + * a missing scheme (a link to an image) + * @param string $scheme The url with a correct scheme + * (e.g. the url from the webpage which does contain the image) + * + * @return string The url with a scheme + */ + private static function completeUrl($url, $scheme) + { + $urlarr = parse_url($url); + + // If the url does allready have an scheme + // we can stop the process here + if (isset($urlarr["scheme"])) { + return($url); + } + + $schemearr = parse_url($scheme); + + $complete = $schemearr["scheme"]."://".$schemearr["host"]; + + if (@$schemearr["port"] != "") { + $complete .= ":".$schemearr["port"]; + } + + if (strpos($urlarr["path"], "/") !== 0) { + $complete .= "/"; + } + + $complete .= $urlarr["path"]; + + if (@$urlarr["query"] != "") { + $complete .= "?".$urlarr["query"]; + } + + if (@$urlarr["fragment"] != "") { + $complete .= "#".$urlarr["fragment"]; + } + + return($complete); + } +}