* @file include/items.php
*/
use Friendica\App;
-use Friendica\ParseUrl;
use Friendica\Content\Feature;
use Friendica\Core\Config;
use Friendica\Core\PConfig;
use Friendica\Protocol\DFRN;
use Friendica\Protocol\OStatus;
use Friendica\Protocol\Feed;
+use Friendica\Util\ParseUrl;
require_once 'include/bbcode.php';
require_once 'include/tags.php';
* @file include/plaintext.php
*/
use Friendica\App;
-use Friendica\ParseUrl;
use Friendica\Core\PConfig;
use Friendica\Object\Image;
+use Friendica\Util\ParseUrl;
require_once "include/bbcode.php";
require_once "include/html2plain.php";
*/
use Friendica\App;
-use Friendica\ParseUrl;
+use Friendica\Util\ParseUrl;
require_once("include/items.php");
use Friendica\Core\System;
use Friendica\Core\Config;
use Friendica\Database\DBM;
-use Friendica\ParseUrl;
+use Friendica\Util\ParseUrl;
use dba;
use DOMDocument;
use DOMXPath;
--- a/include/ParseUrl.php
+++ /dev/null
-<?php
-/**
- * @file include/ParseUrl.php
- * @brief Get informations about a given URL
- */
-namespace Friendica;
-
-use Friendica\Content\OEmbed;
-use Friendica\Object\Image;
-use Friendica\Util\XML;
-
-use dba;
-use DOMXPath;
-use DOMDocument;
-
-require_once 'include/dba.php';
-require_once "include/network.php";
-
-/**
- * @brief Class with methods for extracting certain content from an url
- */
-class ParseUrl
-{
- /**
- * @brief Search for chached embeddable data of an url otherwise fetch it
- *
- * @param string $url The url of the page which should be scraped
- * @param bool $no_guessing If true the parse doens't search for
- * preview pictures
- * @param bool $do_oembed The false option is used by the function fetch_oembed()
- * to avoid endless loops
- *
- * @return array which contains needed data for embedding
- * string 'url' => The url of the parsed page
- * string 'type' => Content type
- * string 'title' => The title of the content
- * string 'text' => The description for the content
- * string 'image' => A preview image of the content (only available
- * if $no_geuessing = false
- * array'images' = Array of preview pictures
- * string 'keywords' => The tags which belong to the content
- *
- * @see ParseUrl::getSiteinfo() for more information about scraping
- * embeddable content
- */
- public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
- {
- if ($url == "") {
- return false;
- }
-
- $r = q(
- "SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
- dbesc(normalise_link($url)),
- intval(!$no_guessing),
- intval($do_oembed)
- );
-
- if ($r) {
- $data = $r[0]["content"];
- }
-
- if (!is_null($data)) {
- $data = unserialize($data);
- return $data;
- }
-
- $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
-
- dba::insert(
- 'parsed_url',
- array(
- 'url' => normalise_link($url), 'guessing' => !$no_guessing,
- 'oembed' => $do_oembed, 'content' => serialize($data),
- 'created' => datetime_convert()),
- true
- );
-
- return $data;
- }
- /**
- * @brief Parse a page for embeddable content information
- *
- * This method parses to url for meta data which can be used to embed
- * the content. If available it prioritizes Open Graph meta tags.
- * If this is not available it uses the twitter cards meta tags.
- * As fallback it uses standard html elements with meta informations
- * like \<title\>Awesome Title\</title\> or
- * \<meta name="description" content="An awesome description"\>
- *
- * @param string $url The url of the page which should be scraped
- * @param bool $no_guessing If true the parse doens't search for
- * preview pictures
- * @param bool $do_oembed The false option is used by the function fetch_oembed()
- * to avoid endless loops
- * @param int $count Internal counter to avoid endless loops
- *
- * @return array which contains needed data for embedding
- * string 'url' => The url of the parsed page
- * string 'type' => Content type
- * string 'title' => The title of the content
- * string 'text' => The description for the content
- * string 'image' => A preview image of the content (only available
- * if $no_geuessing = false
- * array'images' = Array of preview pictures
- * string 'keywords' => The tags which belong to the content
- *
- * @todo https://developers.google.com/+/plugins/snippet/
- * @verbatim
- * <meta itemprop="name" content="Awesome title">
- * <meta itemprop="description" content="An awesome description">
- * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
- *
- * <body itemscope itemtype="http://schema.org/Product">
- * <h1 itemprop="name">Shiny Trinket</h1>
- * <img itemprop="image" src="{image-url}" />
- * <p itemprop="description">Shiny trinkets are shiny.</p>
- * </body>
- * @endverbatim
- */
- public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
- {
- $a = get_app();
-
- $siteinfo = array();
-
- // Check if the URL does contain a scheme
- $scheme = parse_url($url, PHP_URL_SCHEME);
-
- if ($scheme == "") {
- $url = "http://".trim($url, "/");
- }
-
- if ($count > 10) {
- logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
- return($siteinfo);
- }
-
- $url = trim($url, "'");
- $url = trim($url, '"');
-
- $url = strip_tracking_query_params($url);
-
- $siteinfo["url"] = $url;
- $siteinfo["type"] = "link";
-
- $data = z_fetch_url($url);
- if (!$data['success']) {
- return($siteinfo);
- }
-
- // If the file is too large then exit
- if ($data["info"]["download_content_length"] > 1000000) {
- return($siteinfo);
- }
-
- // If it isn't a HTML file then exit
- if (($data["info"]["content_type"] != "") && !strstr(strtolower($data["info"]["content_type"]), "html")) {
- return($siteinfo);
- }
-
- $header = $data["header"];
- $body = $data["body"];
-
- if ($do_oembed) {
- $oembed_data = OEmbed::fetchURL($url);
-
- if (!in_array($oembed_data->type, array("error", "rich", ""))) {
- $siteinfo["type"] = $oembed_data->type;
- }
-
- if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) {
- if (isset($oembed_data->title)) {
- $siteinfo["title"] = trim($oembed_data->title);
- }
- if (isset($oembed_data->description)) {
- $siteinfo["text"] = trim($oembed_data->description);
- }
- if (isset($oembed_data->thumbnail_url)) {
- $siteinfo["image"] = $oembed_data->thumbnail_url;
- }
- }
- }
-
- // Fetch the first mentioned charset. Can be in body or header
- $charset = "";
- if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
- $charset = trim(trim(trim(array_pop($matches)), ';,'));
- }
-
- if ($charset == "") {
- $charset = "utf-8";
- }
-
- if (($charset != "") && (strtoupper($charset) != "UTF-8")) {
- logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
- //$body = mb_convert_encoding($body, "UTF-8", $charset);
- $body = iconv($charset, "UTF-8//TRANSLIT", $body);
- }
-
- $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
-
- $doc = new DOMDocument();
- @$doc->loadHTML($body);
-
- XML::deleteNode($doc, "style");
- XML::deleteNode($doc, "script");
- XML::deleteNode($doc, "option");
- XML::deleteNode($doc, "h1");
- XML::deleteNode($doc, "h2");
- XML::deleteNode($doc, "h3");
- XML::deleteNode($doc, "h4");
- XML::deleteNode($doc, "h5");
- XML::deleteNode($doc, "h6");
- XML::deleteNode($doc, "ol");
- XML::deleteNode($doc, "ul");
-
- $xpath = new DOMXPath($doc);
-
- $list = $xpath->query("//meta[@content]");
- foreach ($list as $node) {
- $attr = array();
- if ($node->attributes->length) {
- foreach ($node->attributes as $attribute) {
- $attr[$attribute->name] = $attribute->value;
- }
- }
-
- if (@$attr["http-equiv"] == "refresh") {
- $path = $attr["content"];
- $pathinfo = explode(";", $path);
- $content = "";
- foreach ($pathinfo as $value) {
- if (substr(strtolower($value), 0, 4) == "url=") {
- $content = substr($value, 4);
- }
- }
- if ($content != "") {
- $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count);
- return($siteinfo);
- }
- }
- }
-
- $list = $xpath->query("//title");
- if ($list->length > 0) {
- $siteinfo["title"] = trim($list->item(0)->nodeValue);
- }
-
- //$list = $xpath->query("head/meta[@name]");
- $list = $xpath->query("//meta[@name]");
- foreach ($list as $node) {
- $attr = array();
- if ($node->attributes->length) {
- foreach ($node->attributes as $attribute) {
- $attr[$attribute->name] = $attribute->value;
- }
- }
-
- $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
-
- if ($attr["content"] != "") {
- switch (strtolower($attr["name"])) {
- case "fulltitle":
- $siteinfo["title"] = trim($attr["content"]);
- break;
- case "description":
- $siteinfo["text"] = trim($attr["content"]);
- break;
- case "thumbnail":
- $siteinfo["image"] = $attr["content"];
- break;
- case "twitter:image":
- $siteinfo["image"] = $attr["content"];
- break;
- case "twitter:image:src":
- $siteinfo["image"] = $attr["content"];
- break;
- case "twitter:card":
- if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) {
- $siteinfo["type"] = $attr["content"];
- }
- break;
- case "twitter:description":
- $siteinfo["text"] = trim($attr["content"]);
- break;
- case "twitter:title":
- $siteinfo["title"] = trim($attr["content"]);
- break;
- case "dc.title":
- $siteinfo["title"] = trim($attr["content"]);
- break;
- case "dc.description":
- $siteinfo["text"] = trim($attr["content"]);
- break;
- case "keywords":
- $keywords = explode(",", $attr["content"]);
- break;
- case "news_keywords":
- $keywords = explode(",", $attr["content"]);
- break;
- }
- }
- if ($siteinfo["type"] == "summary") {
- $siteinfo["type"] = "link";
- }
- }
-
- if (isset($keywords)) {
- $siteinfo["keywords"] = array();
- foreach ($keywords as $keyword) {
- if (!in_array(trim($keyword), $siteinfo["keywords"])) {
- $siteinfo["keywords"][] = trim($keyword);
- }
- }
- }
-
- //$list = $xpath->query("head/meta[@property]");
- $list = $xpath->query("//meta[@property]");
- foreach ($list as $node) {
- $attr = array();
- if ($node->attributes->length) {
- foreach ($node->attributes as $attribute) {
- $attr[$attribute->name] = $attribute->value;
- }
- }
-
- $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
-
- if ($attr["content"] != "") {
- switch (strtolower($attr["property"])) {
- case "og:image":
- $siteinfo["image"] = $attr["content"];
- break;
- case "og:title":
- $siteinfo["title"] = trim($attr["content"]);
- break;
- case "og:description":
- $siteinfo["text"] = trim($attr["content"]);
- break;
- }
- }
- }
-
- if ((@$siteinfo["image"] == "") && !$no_guessing) {
- $list = $xpath->query("//img[@src]");
- foreach ($list as $node) {
- $attr = array();
- if ($node->attributes->length) {
- foreach ($node->attributes as $attribute) {
- $attr[$attribute->name] = $attribute->value;
- }
- }
-
- $src = self::completeUrl($attr["src"], $url);
- $photodata = Image::getInfoFromURL($src);
-
- if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
- if ($photodata[0] > 300) {
- $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
- $photodata[0] = 300;
- }
- if ($photodata[1] > 300) {
- $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
- $photodata[1] = 300;
- }
- $siteinfo["images"][] = array("src" => $src,
- "width" => $photodata[0],
- "height" => $photodata[1]);
- }
- }
- } elseif ($siteinfo["image"] != "") {
- $src = self::completeUrl($siteinfo["image"], $url);
-
- unset($siteinfo["image"]);
-
- $photodata = Image::getInfoFromURL($src);
-
- if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) {
- $siteinfo["images"][] = array("src" => $src,
- "width" => $photodata[0],
- "height" => $photodata[1]);
- }
- }
-
- if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) {
- $text = "";
-
- $list = $xpath->query("//div[@class='article']");
- foreach ($list as $node) {
- if (strlen($node->nodeValue) > 40) {
- $text .= " ".trim($node->nodeValue);
- }
- }
-
- if ($text == "") {
- $list = $xpath->query("//div[@class='content']");
- foreach ($list as $node) {
- if (strlen($node->nodeValue) > 40) {
- $text .= " ".trim($node->nodeValue);
- }
- }
- }
-
- // If none text was found then take the paragraph content
- if ($text == "") {
- $list = $xpath->query("//p");
- foreach ($list as $node) {
- if (strlen($node->nodeValue) > 40) {
- $text .= " ".trim($node->nodeValue);
- }
- }
- }
-
- if ($text != "") {
- $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
-
- while (strpos($text, " ")) {
- $text = trim(str_replace(" ", " ", $text));
- }
-
- $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...');
- }
- }
-
- logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
-
- call_hooks("getsiteinfo", $siteinfo);
-
- return($siteinfo);
- }
-
- /**
- * @brief Convert tags from CSV to an array
- *
- * @param string $string Tags
- * @return array with formatted Hashtags
- */
- public static function convertTagsToArray($string)
- {
- $arr_tags = str_getcsv($string);
- if (count($arr_tags)) {
- // add the # sign to every tag
- array_walk($arr_tags, array("self", "arrAddHashes"));
-
- return $arr_tags;
- }
- }
-
- /**
- * @brief Add a hasht sign to a string
- *
- * This method is used as callback function
- *
- * @param string $tag The pure tag name
- * @param int $k Counter for internal use
- * @return void
- */
- private static function arrAddHashes(&$tag, $k)
- {
- $tag = "#" . $tag;
- }
-
- /**
- * @brief Add a scheme to an url
- *
- * The src attribute of some html elements (e.g. images)
- * can miss the scheme so we need to add the correct
- * scheme
- *
- * @param string $url The url which possibly does have
- * a missing scheme (a link to an image)
- * @param string $scheme The url with a correct scheme
- * (e.g. the url from the webpage which does contain the image)
- *
- * @return string The url with a scheme
- */
- private static function completeUrl($url, $scheme)
- {
- $urlarr = parse_url($url);
-
- // If the url does allready have an scheme
- // we can stop the process here
- if (isset($urlarr["scheme"])) {
- return($url);
- }
-
- $schemearr = parse_url($scheme);
-
- $complete = $schemearr["scheme"]."://".$schemearr["host"];
-
- if (@$schemearr["port"] != "") {
- $complete .= ":".$schemearr["port"];
- }
-
- if (strpos($urlarr["path"], "/") !== 0) {
- $complete .= "/";
- }
-
- $complete .= $urlarr["path"];
-
- if (@$urlarr["query"] != "") {
- $complete .= "?".$urlarr["query"];
- }
-
- if (@$urlarr["fragment"] != "") {
- $complete .= "#".$urlarr["fragment"];
- }
-
- return($complete);
- }
-}
--- /dev/null
+++ b/src/Util/ParseUrl.php
+<?php
+/**
+ * @file src/Util/ParseUrl.php
+ * @brief Get information about a given URL
+ */
+namespace Friendica\Util;
+
+use Friendica\Content\OEmbed;
+use Friendica\Object\Image;
+use Friendica\Util\XML;
+
+use dba;
+use DOMXPath;
+use DOMDocument;
+
+require_once 'include/dba.php';
+require_once "include/network.php";
+
+/**
+ * @brief Class with methods for extracting certain content from a URL
+ */
+class ParseUrl
+{
+ /**
+ * @brief Search for cached embeddable data of a URL, otherwise fetch it
+ *
+ * @param string $url The url of the page which should be scraped
+ * @param bool $no_guessing If true the parser doesn't search for
+ * preview pictures
+ * @param bool $do_oembed The false option is used by the function fetch_oembed()
+ * to avoid endless loops
+ *
+ * @return array which contains needed data for embedding
+ * string 'url' => The url of the parsed page
+ * string 'type' => Content type
+ * string 'title' => The title of the content
+ * string 'text' => The description for the content
+ * string 'image' => A preview image of the content (only available
+ * if $no_guessing = false)
+ * array 'images' => Array of preview pictures
+ * string 'keywords' => The tags which belong to the content
+ *
+ * @see ParseUrl::getSiteinfo() for more information about scraping
+ * embeddable content
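+ *
+ * A minimal usage sketch; the URL is only an illustrative example:
+ * @verbatim
+ * $data = ParseUrl::getSiteinfoCached("https://example.com/some-article");
+ * echo $data["title"] . " - " . $data["text"];
+ * @endverbatim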
+ */
+ public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
+ {
+ if ($url == "") {
+ return false;
+ }
+
+ $r = q(
+ "SELECT * FROM `parsed_url` WHERE `url` = '%s' AND `guessing` = %d AND `oembed` = %d",
+ dbesc(normalise_link($url)),
+ intval(!$no_guessing),
+ intval($do_oembed)
+ );
+
+ if ($r) {
+ $data = $r[0]["content"];
+ }
+
+ if (!is_null($data)) {
+ $data = unserialize($data);
+ return $data;
+ }
+
+ $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
+
+ dba::insert(
+ 'parsed_url',
+ array(
+ 'url' => normalise_link($url), 'guessing' => !$no_guessing,
+ 'oembed' => $do_oembed, 'content' => serialize($data),
+ 'created' => datetime_convert()),
+ true
+ );
+
+ return $data;
+ }
+ /**
+ * @brief Parse a page for embeddable content information
+ *
+ * This method parses the URL for meta data which can be used to embed
+ * the content. If available it prioritizes Open Graph meta tags.
+ * If these are not available it uses the Twitter Card meta tags.
+ * As a fallback it uses standard HTML elements with meta information
+ * like \<title\>Awesome Title\</title\> or
+ * \<meta name="description" content="An awesome description"\>
+ *
+ * @param string $url The url of the page which should be scraped
+ * @param bool $no_guessing If true the parser doesn't search for
+ * preview pictures
+ * @param bool $do_oembed The false option is used by the function fetch_oembed()
+ * to avoid endless loops
+ * @param int $count Internal counter to avoid endless loops
+ *
+ * @return array which contains needed data for embedding
+ * string 'url' => The url of the parsed page
+ * string 'type' => Content type
+ * string 'title' => The title of the content
+ * string 'text' => The description for the content
+ * string 'image' => A preview image of the content (only available
+ * if $no_guessing = false)
+ * array 'images' => Array of preview pictures
+ * string 'keywords' => The tags which belong to the content
+ *
+ * @todo https://developers.google.com/+/plugins/snippet/
+ * @verbatim
+ * <meta itemprop="name" content="Awesome title">
+ * <meta itemprop="description" content="An awesome description">
+ * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
+ *
+ * <body itemscope itemtype="http://schema.org/Product">
+ * <h1 itemprop="name">Shiny Trinket</h1>
+ * <img itemprop="image" src="{image-url}" />
+ * <p itemprop="description">Shiny trinkets are shiny.</p>
+ * </body>
+ * @endverbatim
+ */
+ public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
+ {
+ $a = get_app();
+
+ $siteinfo = array();
+
+ // Check if the URL contains a scheme
+ $scheme = parse_url($url, PHP_URL_SCHEME);
+
+ if ($scheme == "") {
+ $url = "http://".trim($url, "/");
+ }
+
+ if ($count > 10) {
+ logger("parseurl_getsiteinfo: Endless loop detected for ".$url, LOGGER_DEBUG);
+ return($siteinfo);
+ }
+
+ $url = trim($url, "'");
+ $url = trim($url, '"');
+
+ $url = strip_tracking_query_params($url);
+
+ $siteinfo["url"] = $url;
+ $siteinfo["type"] = "link";
+
+ $data = z_fetch_url($url);
+ if (!$data['success']) {
+ return($siteinfo);
+ }
+
+ // If the file is too large then exit
+ if ($data["info"]["download_content_length"] > 1000000) {
+ return($siteinfo);
+ }
+
+ // If it isn't an HTML file then exit
+ if (($data["info"]["content_type"] != "") && !strstr(strtolower($data["info"]["content_type"]), "html")) {
+ return($siteinfo);
+ }
+
+ $header = $data["header"];
+ $body = $data["body"];
+
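+ // Try to fetch type, title, description and thumbnail from an oEmbed endpoint first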
+ if ($do_oembed) {
+ $oembed_data = OEmbed::fetchURL($url);
+
+ if (!in_array($oembed_data->type, array("error", "rich", ""))) {
+ $siteinfo["type"] = $oembed_data->type;
+ }
+
+ if (($oembed_data->type == "link") && ($siteinfo["type"] != "photo")) {
+ if (isset($oembed_data->title)) {
+ $siteinfo["title"] = trim($oembed_data->title);
+ }
+ if (isset($oembed_data->description)) {
+ $siteinfo["text"] = trim($oembed_data->description);
+ }
+ if (isset($oembed_data->thumbnail_url)) {
+ $siteinfo["image"] = $oembed_data->thumbnail_url;
+ }
+ }
+ }
+
+ // Fetch the first mentioned charset. Can be in body or header
+ $charset = "";
+ if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches)) {
+ $charset = trim(trim(trim(array_pop($matches)), ';,'));
+ }
+
+ if ($charset == "") {
+ $charset = "utf-8";
+ }
+
+ if (($charset != "") && (strtoupper($charset) != "UTF-8")) {
+ logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
+ //$body = mb_convert_encoding($body, "UTF-8", $charset);
+ $body = iconv($charset, "UTF-8//TRANSLIT", $body);
+ }
+
+ $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
+
+ $doc = new DOMDocument();
+ @$doc->loadHTML($body);
+
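+ // Remove style, script, option, heading and list elements before evaluating the remaining text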
+ XML::deleteNode($doc, "style");
+ XML::deleteNode($doc, "script");
+ XML::deleteNode($doc, "option");
+ XML::deleteNode($doc, "h1");
+ XML::deleteNode($doc, "h2");
+ XML::deleteNode($doc, "h3");
+ XML::deleteNode($doc, "h4");
+ XML::deleteNode($doc, "h5");
+ XML::deleteNode($doc, "h6");
+ XML::deleteNode($doc, "ol");
+ XML::deleteNode($doc, "ul");
+
+ $xpath = new DOMXPath($doc);
+
+ $list = $xpath->query("//meta[@content]");
+ foreach ($list as $node) {
+ $attr = array();
+ if ($node->attributes->length) {
+ foreach ($node->attributes as $attribute) {
+ $attr[$attribute->name] = $attribute->value;
+ }
+ }
+
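+ // Follow <meta http-equiv="refresh"> redirects by parsing the redirect target instead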
+ if (@$attr["http-equiv"] == "refresh") {
+ $path = $attr["content"];
+ $pathinfo = explode(";", $path);
+ $content = "";
+ foreach ($pathinfo as $value) {
+ if (substr(strtolower($value), 0, 4) == "url=") {
+ $content = substr($value, 4);
+ }
+ }
+ if ($content != "") {
+ $siteinfo = self::getSiteinfo($content, $no_guessing, $do_oembed, ++$count);
+ return($siteinfo);
+ }
+ }
+ }
+
+ $list = $xpath->query("//title");
+ if ($list->length > 0) {
+ $siteinfo["title"] = trim($list->item(0)->nodeValue);
+ }
+
+ //$list = $xpath->query("head/meta[@name]");
+ $list = $xpath->query("//meta[@name]");
+ foreach ($list as $node) {
+ $attr = array();
+ if ($node->attributes->length) {
+ foreach ($node->attributes as $attribute) {
+ $attr[$attribute->name] = $attribute->value;
+ }
+ }
+
+ $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
+
+ if ($attr["content"] != "") {
+ switch (strtolower($attr["name"])) {
+ case "fulltitle":
+ $siteinfo["title"] = trim($attr["content"]);
+ break;
+ case "description":
+ $siteinfo["text"] = trim($attr["content"]);
+ break;
+ case "thumbnail":
+ $siteinfo["image"] = $attr["content"];
+ break;
+ case "twitter:image":
+ $siteinfo["image"] = $attr["content"];
+ break;
+ case "twitter:image:src":
+ $siteinfo["image"] = $attr["content"];
+ break;
+ case "twitter:card":
+ if (($siteinfo["type"] == "") || ($attr["content"] == "photo")) {
+ $siteinfo["type"] = $attr["content"];
+ }
+ break;
+ case "twitter:description":
+ $siteinfo["text"] = trim($attr["content"]);
+ break;
+ case "twitter:title":
+ $siteinfo["title"] = trim($attr["content"]);
+ break;
+ case "dc.title":
+ $siteinfo["title"] = trim($attr["content"]);
+ break;
+ case "dc.description":
+ $siteinfo["text"] = trim($attr["content"]);
+ break;
+ case "keywords":
+ $keywords = explode(",", $attr["content"]);
+ break;
+ case "news_keywords":
+ $keywords = explode(",", $attr["content"]);
+ break;
+ }
+ }
+ if ($siteinfo["type"] == "summary") {
+ $siteinfo["type"] = "link";
+ }
+ }
+
+ if (isset($keywords)) {
+ $siteinfo["keywords"] = array();
+ foreach ($keywords as $keyword) {
+ if (!in_array(trim($keyword), $siteinfo["keywords"])) {
+ $siteinfo["keywords"][] = trim($keyword);
+ }
+ }
+ }
+
+ //$list = $xpath->query("head/meta[@property]");
+ $list = $xpath->query("//meta[@property]");
+ foreach ($list as $node) {
+ $attr = array();
+ if ($node->attributes->length) {
+ foreach ($node->attributes as $attribute) {
+ $attr[$attribute->name] = $attribute->value;
+ }
+ }
+
+ $attr["content"] = trim(html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8"));
+
+ if ($attr["content"] != "") {
+ switch (strtolower($attr["property"])) {
+ case "og:image":
+ $siteinfo["image"] = $attr["content"];
+ break;
+ case "og:title":
+ $siteinfo["title"] = trim($attr["content"]);
+ break;
+ case "og:description":
+ $siteinfo["text"] = trim($attr["content"]);
+ break;
+ }
+ }
+ }
+
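+ // If no preview image was found in the meta data, optionally guess one from the
+ // <img> tags of the page; otherwise resolve and measure the announced image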
+ if ((@$siteinfo["image"] == "") && !$no_guessing) {
+ $list = $xpath->query("//img[@src]");
+ foreach ($list as $node) {
+ $attr = array();
+ if ($node->attributes->length) {
+ foreach ($node->attributes as $attribute) {
+ $attr[$attribute->name] = $attribute->value;
+ }
+ }
+
+ $src = self::completeUrl($attr["src"], $url);
+ $photodata = Image::getInfoFromURL($src);
+
+ if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
+ if ($photodata[0] > 300) {
+ $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
+ $photodata[0] = 300;
+ }
+ if ($photodata[1] > 300) {
+ $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
+ $photodata[1] = 300;
+ }
+ $siteinfo["images"][] = array("src" => $src,
+ "width" => $photodata[0],
+ "height" => $photodata[1]);
+ }
+ }
+ } elseif ($siteinfo["image"] != "") {
+ $src = self::completeUrl($siteinfo["image"], $url);
+
+ unset($siteinfo["image"]);
+
+ $photodata = Image::getInfoFromURL($src);
+
+ if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) {
+ $siteinfo["images"][] = array("src" => $src,
+ "width" => $photodata[0],
+ "height" => $photodata[1]);
+ }
+ }
+
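+ // If no description was found, optionally guess one from typical content containers of the page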
+ if ((@$siteinfo["text"] == "") && (@$siteinfo["title"] != "") && !$no_guessing) {
+ $text = "";
+
+ $list = $xpath->query("//div[@class='article']");
+ foreach ($list as $node) {
+ if (strlen($node->nodeValue) > 40) {
+ $text .= " ".trim($node->nodeValue);
+ }
+ }
+
+ if ($text == "") {
+ $list = $xpath->query("//div[@class='content']");
+ foreach ($list as $node) {
+ if (strlen($node->nodeValue) > 40) {
+ $text .= " ".trim($node->nodeValue);
+ }
+ }
+ }
+
+ // If no text was found then take the paragraph content
+ if ($text == "") {
+ $list = $xpath->query("//p");
+ foreach ($list as $node) {
+ if (strlen($node->nodeValue) > 40) {
+ $text .= " ".trim($node->nodeValue);
+ }
+ }
+ }
+
+ if ($text != "") {
+ $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));
+
+ while (strpos($text, " ")) {
+ $text = trim(str_replace(" ", " ", $text));
+ }
+
+ $siteinfo["text"] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, "UTF-8").'...');
+ }
+ }
+
+ logger("parseurl_getsiteinfo: Siteinfo for ".$url." ".print_r($siteinfo, true), LOGGER_DEBUG);
+
+ call_hooks("getsiteinfo", $siteinfo);
+
+ return($siteinfo);
+ }
+
+ /**
+ * @brief Convert tags from CSV to an array
+ *
+ * @param string $string Tags
+ * @return array with formatted Hashtags
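+ *
+ * A short illustration of the expected result:
+ * @verbatim
+ * ParseUrl::convertTagsToArray("friendica,test");
+ * // returns array("#friendica", "#test")
+ * @endverbatim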
+ */
+ public static function convertTagsToArray($string)
+ {
+ $arr_tags = str_getcsv($string);
+ if (count($arr_tags)) {
+ // add the # sign to every tag
+ array_walk($arr_tags, array("self", "arrAddHashes"));
+
+ return $arr_tags;
+ }
+ }
+
+ /**
+ * @brief Add a hash sign to a string
+ *
+ * This method is used as callback function
+ *
+ * @param string $tag The pure tag name
+ * @param int $k Counter for internal use
+ * @return void
+ */
+ private static function arrAddHashes(&$tag, $k)
+ {
+ $tag = "#" . $tag;
+ }
+
+ /**
+ * @brief Add a scheme to a URL
+ *
+ * The src attribute of some HTML elements (e.g. images)
+ * can be missing the scheme, so we need to add the
+ * correct scheme
+ *
+ * @param string $url The url which may be missing a scheme
+ * (a link to an image)
+ * @param string $scheme The url with a correct scheme
+ * (e.g. the url of the webpage which contains the image)
+ *
+ * @return string The url with a scheme
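+ *
+ * Example with hypothetical values:
+ * @verbatim
+ * self::completeUrl("/img/logo.png", "https://example.com/blog/post");
+ * // returns "https://example.com/img/logo.png"
+ * @endverbatim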
+ */
+ private static function completeUrl($url, $scheme)
+ {
+ $urlarr = parse_url($url);
+
+ // If the url already has a scheme
+ // we can stop the process here
+ if (isset($urlarr["scheme"])) {
+ return($url);
+ }
+
+ $schemearr = parse_url($scheme);
+
+ $complete = $schemearr["scheme"]."://".$schemearr["host"];
+
+ if (@$schemearr["port"] != "") {
+ $complete .= ":".$schemearr["port"];
+ }
+
+ if (strpos($urlarr["path"], "/") !== 0) {
+ $complete .= "/";
+ }
+
+ $complete .= $urlarr["path"];
+
+ if (@$urlarr["query"] != "") {
+ $complete .= "?".$urlarr["query"];
+ }
+
+ if (@$urlarr["fragment"] != "") {
+ $complete .= "#".$urlarr["fragment"];
+ }
+
+ return($complete);
+ }
+}