From: Michael Vogel <icarus@dabo.de>
Date: Thu, 12 Jul 2012 21:41:04 +0000 (+0200)
Subject: parse_url: Further improvements of the new method to fetch page data
X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=02a1fc9cd08fba2168895d1892a91d8143323848;p=friendica.git

parse_url: Further improvements of the new method to fetch page data
---

diff --git a/include/api.php b/include/api.php
index 3858b9fe32..e0b788424e 100644
--- a/include/api.php
+++ b/include/api.php
@@ -1727,5 +1727,6 @@ notifications/follow
 notifications/leave
 blocks/exists
 blocks/blocking
+lists
 */
 
diff --git a/mod/parse_url.php b/mod/parse_url.php
index 97e1658c89..4d894969aa 100644
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -1,6 +1,4 @@
 <?php
-require_once('include/Photo.php');
-
 if(!function_exists('deletenode')) {
 	function deletenode(&$doc, $node)
 	{
@@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
 	}
 }
 
+function completeurl($url, $scheme) {
+        $urlarr = parse_url($url);
+
+        if (isset($urlarr["scheme"]))
+                return($url);
+
+        $schemearr = parse_url($scheme);
+
+        $complete = $schemearr["scheme"]."://".$schemearr["host"];
+
+        if ($schemearr["port"] != "")
+                $complete .= ":".$schemearr["port"];
+
+        $complete .= $urlarr["path"];
+
+        if ($urlarr["query"] != "")
+                $complete .= "?".$urlarr["query"];
+
+        if ($urlarr["fragment"] != "")
+                $complete .= "#".$urlarr["fragment"];
+
+        return($complete);
+}
+
 function parseurl_getsiteinfo($url) {
 	$siteinfo = array();
 
@@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
 	$header = curl_exec($ch);
 	curl_close($ch);
 
-	if (preg_match('/charset=(.*?)\n/', $header, $matches))
+	// Fetch the first mentioned charset. Can be in body or header
+	if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 		$charset = trim(array_pop($matches));
 	else
 		$charset = "utf-8";
@@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {
 
 	$xpath = new DomXPath($doc);
 
-	$list = $xpath->query("head/title");
+	//$list = $xpath->query("head/title");
+	$list = $xpath->query("//title");
 	foreach ($list as $node)
 		$siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
 
-	$list = $xpath->query("head/meta[@name]");
+	//$list = $xpath->query("head/meta[@name]");
+	$list = $xpath->query("//meta[@name]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
 		}
 	}
 
-	$list = $xpath->query("head/meta[@property]");
+	//$list = $xpath->query("head/meta[@property]");
+	$list = $xpath->query("//meta[@property]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
                                 foreach ($node->attributes as $attribute)
                                         $attr[$attribute->name] = $attribute->value;
 
-                        // guess mimetype from headers or filename
-                        $type = guess_image_type($attr["src"],true);
+			$src = completeurl($attr["src"], $url);
+			$photodata = getimagesize($src);
 
-                        $i = fetch_url($attr["src"]);
-                        $ph = new Photo($i, $type);
-
-			if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) {
-				if ($siteinfo["image"] == "")
-	                                $siteinfo["image"] = $attr["src"];
-
-				if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
-					$ph->scaleImage(300);
-	                                $siteinfo["images"][] = array("src"=>$attr["src"],
-									"width"=>$ph->getWidth(),
-									"height"=>$ph->getHeight());
-				} else
-	                                $siteinfo["images"][] = array("src"=>$attr["src"],
-									"width"=>$ph->getWidth(),
-									"height"=>$ph->getHeight());
+			if (($photodata[0] > 150) and ($photodata[1] > 150)) {
+				if ($photodata[0] > 300) {
+					$photodata[1] = $photodata[1] * (300 / $photodata[0]);
+					$photodata[0] = 300;
+				}
+				if ($photodata[1] > 300) {
+					$photodata[0] = $photodata[0] * (300 / $photodata[1]);
+					$photodata[1] = 300;
+				}
+				$siteinfo["images"][] = array("src"=>$src,
+								"width"=>$photodata[0],
+								"height"=>$photodata[1]);
 			}
+
                 }
         } else {
-		// guess mimetype from headers or filename
-                $type = guess_image_type($siteinfo["image"],true);
-
-                $i = fetch_url($siteinfo["image"]);
-                $ph = new Photo($i, $type);
+		$src = completeurl($siteinfo["image"], $url);
+		$photodata = getimagesize($src);
 
-		if($ph->is_valid())
-			$siteinfo["images"][] = array("src"=>$siteinfo["image"],
-							"width"=>$ph->getWidth(),
-							"height"=>$ph->getHeight());
+		if (($photodata[0] > 10) and ($photodata[1] > 10))
+			$siteinfo["images"][] = array("src"=>$src,
+							"width"=>$photodata[0],
+							"height"=>$photodata[1]);
 	}
 
 	if ($siteinfo["text"] == "") {
@@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {
 
 		$list = $xpath->query("//div[@class='article']");
 		foreach ($list as $node)
-			$text .= " ".trim($node->nodeValue);
+			if (strlen($node->nodeValue) > 40)
+				$text .= " ".trim($node->nodeValue);
 
 		if ($text == "") {
 			$list = $xpath->query("//div[@class='content']");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
+					$text .= " ".trim($node->nodeValue);
 		}
 
 		// If none text was found then take the paragraph content
 		if ($text == "") {
 			$list = $xpath->query("//p");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
+					$text .= " ".trim($node->nodeValue);
 		}
 
 		if ($text != "") {
@@ -238,9 +261,9 @@ function parse_url_content(&$a) {
 	if($url && $title && $text) {
 
 		if($textmode)
-			$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
+			$text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
 		else
-			$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';
+			$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';
 
 		$title = str_replace(array("\r","\n"),array('',''),$title);
 
@@ -255,7 +278,8 @@ function parse_url_content(&$a) {
 	$siteinfo = parseurl_getsiteinfo($url);
 
 	if($siteinfo["title"] == "") {
-		echo sprintf($template,$url,$url,'') . $str_tags;
+		echo print_r($siteinfo, true);
+		//echo sprintf($template,$url,$url,'') . $str_tags;
 		killme();
 	} else {
 		$text = $siteinfo["text"];
@@ -305,7 +329,7 @@ function parse_url_content(&$a) {
 	}
 
 	if($image) {
-		$text = $br.$br.$image.$br.$text;
+		$text = $br.$br.$image.$text;
 	}
 	$title = str_replace(array("\r","\n"),array('',''),$title);
 
@@ -313,6 +337,6 @@ function parse_url_content(&$a) {
 
 	logger('parse_url: returns: ' . $result);
 
-	echo $result;
+	echo trim($result);
 	killme();
 }