]> git.mxchange.org Git - friendica.git/commitdiff
parse_url: Further improvements of the new method to fetch page data
authorMichael Vogel <icarus@dabo.de>
Thu, 12 Jul 2012 21:41:04 +0000 (23:41 +0200)
committerMichael Vogel <icarus@dabo.de>
Thu, 12 Jul 2012 21:41:04 +0000 (23:41 +0200)
include/api.php
mod/parse_url.php

index 3858b9fe32b635de95e84120c53c4f5ba6e89410..e0b788424e4d2e7f506efe287d34508d1b647c84 100644 (file)
@@ -1727,5 +1727,6 @@ notifications/follow
 notifications/leave
 blocks/exists
 blocks/blocking
+lists
 */
 
index 97e1658c8985fd47b6360f9685a91d0549c3ca20..4d894969aadbde21c152ac87895da09aba6915d9 100644 (file)
@@ -1,6 +1,4 @@
 <?php
-require_once('include/Photo.php');
-
 if(!function_exists('deletenode')) {
        function deletenode(&$doc, $node)
        {
@@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
        }
 }
 
+/**
+ * Turns a possibly relative URL into an absolute one with the help of a base URL.
+ *
+ * Handles absolute URLs (returned unchanged), protocol-relative URLs ("//host/path"),
+ * root-relative paths ("/path"), relative paths ("dir/file") and references that only
+ * consist of a query and/or a fragment. Dot segments ("./", "../") are not collapsed.
+ *
+ * @param string $url    URL to complete
+ * @param string $scheme Absolute URL of the page $url was found on; supplies the
+ *                       scheme, host, port and base path that $url is missing
+ * @return string The completed URL, or $url unchanged when it already has a scheme
+ *                or when either argument cannot be parsed into usable parts
+ */
+function completeurl($url, $scheme) {
+        $urlarr = parse_url($url);
+
+        // Already absolute, or too malformed for parse_url(): leave it untouched
+        if (($urlarr === false) || isset($urlarr["scheme"]))
+                return($url);
+
+        $schemearr = parse_url($scheme);
+
+        // Without scheme and host in the base there is nothing to complete with
+        if (($schemearr === false) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
+                return($url);
+
+        // Protocol-relative URLs ("//host/path") bring their own host and port
+        $authority = isset($urlarr["host"]) ? $urlarr : $schemearr;
+
+        $complete = $schemearr["scheme"]."://".$authority["host"];
+
+        if (isset($authority["port"]))
+                $complete .= ":".$authority["port"];
+
+        // parse_url() omits absent components, so guard every access
+        $path = isset($urlarr["path"]) ? $urlarr["path"] : "";
+        $query = isset($urlarr["query"]) ? $urlarr["query"] : "";
+
+        if (!isset($urlarr["host"])) {
+                $basepath = isset($schemearr["path"]) ? $schemearr["path"] : "/";
+
+                if ($path === "") {
+                        // Only "?query" and/or "#fragment" given: keep the path of the base page
+                        $path = $basepath;
+
+                        if (($query === "") && isset($schemearr["query"]))
+                                $query = $schemearr["query"];
+                } elseif ($path[0] != "/") {
+                        // Relative path: resolve it against the directory of the base page
+                        $path = substr($basepath, 0, strrpos($basepath, "/") + 1).$path;
+                }
+        }
+
+        $complete .= $path;
+
+        if ($query !== "")
+                $complete .= "?".$query;
+
+        if (isset($urlarr["fragment"]) && ($urlarr["fragment"] !== ""))
+                $complete .= "#".$urlarr["fragment"];
+
+        return($complete);
+}
+
 function parseurl_getsiteinfo($url) {
        $siteinfo = array();
 
@@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
        $header = curl_exec($ch);
        curl_close($ch);
 
-       if (preg_match('/charset=(.*?)\n/', $header, $matches))
+       // Fetch the first mentioned charset. Can be in body or header
+       if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
                $charset = trim(array_pop($matches));
        else
                $charset = "utf-8";
@@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {
 
        $xpath = new DomXPath($doc);
 
-       $list = $xpath->query("head/title");
+       //$list = $xpath->query("head/title");
+       $list = $xpath->query("//title");
        foreach ($list as $node)
                $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");
 
-       $list = $xpath->query("head/meta[@name]");
+       //$list = $xpath->query("head/meta[@name]");
+       $list = $xpath->query("//meta[@name]");
        foreach ($list as $node) {
                $attr = array();
                if ($node->attributes->length)
@@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
                }
        }
 
-       $list = $xpath->query("head/meta[@property]");
+       //$list = $xpath->query("head/meta[@property]");
+       $list = $xpath->query("//meta[@property]");
        foreach ($list as $node) {
                $attr = array();
                if ($node->attributes->length)
@@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
                                 foreach ($node->attributes as $attribute)
                                         $attr[$attribute->name] = $attribute->value;
 
-                        // guess mimetype from headers or filename
-                        $type = guess_image_type($attr["src"],true);
+                       $src = completeurl($attr["src"], $url);
+                       $photodata = getimagesize($src);
 
-                        $i = fetch_url($attr["src"]);
-                        $ph = new Photo($i, $type);
-
-                       if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) {
-                               if ($siteinfo["image"] == "")
-                                       $siteinfo["image"] = $attr["src"];
-
-                               if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
-                                       $ph->scaleImage(300);
-                                       $siteinfo["images"][] = array("src"=>$attr["src"],
-                                                                       "width"=>$ph->getWidth(),
-                                                                       "height"=>$ph->getHeight());
-                               } else
-                                       $siteinfo["images"][] = array("src"=>$attr["src"],
-                                                                       "width"=>$ph->getWidth(),
-                                                                       "height"=>$ph->getHeight());
+                       if (($photodata[0] > 150) and ($photodata[1] > 150)) {
+                               if ($photodata[0] > 300) {
+                                       $photodata[1] = $photodata[1] * (300 / $photodata[0]);
+                                       $photodata[0] = 300;
+                               }
+                               if ($photodata[1] > 300) {
+                                       $photodata[0] = $photodata[0] * (300 / $photodata[1]);
+                                       $photodata[1] = 300;
+                               }
+                               $siteinfo["images"][] = array("src"=>$src,
+                                                               "width"=>$photodata[0],
+                                                               "height"=>$photodata[1]);
                        }
+
                 }
         } else {
-               // guess mimetype from headers or filename
-                $type = guess_image_type($siteinfo["image"],true);
-
-                $i = fetch_url($siteinfo["image"]);
-                $ph = new Photo($i, $type);
+               $src = completeurl($siteinfo["image"], $url);
+               $photodata = getimagesize($src);
 
-               if($ph->is_valid())
-                       $siteinfo["images"][] = array("src"=>$siteinfo["image"],
-                                                       "width"=>$ph->getWidth(),
-                                                       "height"=>$ph->getHeight());
+               if (($photodata[0] > 10) and ($photodata[1] > 10))
+                       $siteinfo["images"][] = array("src"=>$src,
+                                                       "width"=>$photodata[0],
+                                                       "height"=>$photodata[1]);
        }
 
        if ($siteinfo["text"] == "") {
@@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {
 
                $list = $xpath->query("//div[@class='article']");
                foreach ($list as $node)
-                       $text .= " ".trim($node->nodeValue);
+                       if (strlen($node->nodeValue) > 40)
+                               $text .= " ".trim($node->nodeValue);
 
                if ($text == "") {
                        $list = $xpath->query("//div[@class='content']");
                        foreach ($list as $node)
-                               $text .= " ".trim($node->nodeValue);
+                               if (strlen($node->nodeValue) > 40)
+                                       $text .= " ".trim($node->nodeValue);
                }
 
                // If none text was found then take the paragraph content
                if ($text == "") {
                        $list = $xpath->query("//p");
                        foreach ($list as $node)
-                               $text .= " ".trim($node->nodeValue);
+                               if (strlen($node->nodeValue) > 40)
+                                       $text .= " ".trim($node->nodeValue);
                }
 
                if ($text != "") {
@@ -238,9 +261,9 @@ function parse_url_content(&$a) {
        if($url && $title && $text) {
 
                if($textmode)
-                       $text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
+                       $text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
                else
-                       $text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';
+                       $text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';
 
                $title = str_replace(array("\r","\n"),array('',''),$title);
 
@@ -255,7 +278,8 @@ function parse_url_content(&$a) {
        $siteinfo = parseurl_getsiteinfo($url);
 
        if($siteinfo["title"] == "") {
-               echo sprintf($template,$url,$url,'') . $str_tags;
+               echo print_r($siteinfo, true);
+               //echo sprintf($template,$url,$url,'') . $str_tags;
                killme();
        } else {
                $text = $siteinfo["text"];
@@ -305,7 +329,7 @@ function parse_url_content(&$a) {
        }
 
        if($image) {
-               $text = $br.$br.$image.$br.$text;
+               $text = $br.$br.$image.$text;
        }
        $title = str_replace(array("\r","\n"),array('',''),$title);
 
@@ -313,6 +337,6 @@ function parse_url_content(&$a) {
 
        logger('parse_url: returns: ' . $result);
 
-       echo $result;
+       echo trim($result);
        killme();
 }