]> git.mxchange.org Git - friendica.git/commitdiff
Don't guess the site info / restrict the description length
authorMichael Vogel <icarus@dabo.de>
Fri, 15 Nov 2019 13:28:42 +0000 (14:28 +0100)
committerMichael Vogel <icarus@dabo.de>
Fri, 15 Nov 2019 13:28:42 +0000 (14:28 +0100)
src/Util/ParseUrl.php

index 1f8fbdeabe4e8898d2a4442196a46a2093c2afd3..569136e6d13da5b67ebfd16f94a7823747326f45 100644 (file)
@@ -17,6 +17,16 @@ use Friendica\Database\DBA;
  */
 class ParseUrl
 {
+       /**
+        * Maximum number of characters for the description
+        */
+       const MAX_DESC_COUNT = 250;
+
+       /**
+        * Minimum number of characters for the description
+        */
+       const MIN_DESC_COUNT = 100;
+
        /**
         * @brief Search for chached embeddable data of an url otherwise fetch it
         *
@@ -336,36 +346,7 @@ class ParseUrl
                        $siteinfo['type'] = 'link';
                }
 
-               if (empty($siteinfo['image']) && !$no_guessing) {
-                       $list = $xpath->query('//img[@src]');
-                       foreach ($list as $node) {
-                               $img_tag = [];
-                               if ($node->attributes->length) {
-                                       foreach ($node->attributes as $attribute) {
-                                               $img_tag[$attribute->name] = $attribute->value;
-                                       }
-                               }
-
-                               $src = self::completeUrl($img_tag['src'], $url);
-                               $photodata = Images::getInfoFromURLCached($src);
-
-                               if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
-                                       if ($photodata[0] > 300) {
-                                               $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
-                                               $photodata[0] = 300;
-                                       }
-                                       if ($photodata[1] > 300) {
-                                               $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
-                                               $photodata[1] = 300;
-                                       }
-                                       $siteinfo['images'][] = [
-                                               'src'    => $src,
-                                               'width'  => $photodata[0],
-                                               'height' => $photodata[1]
-                                       ];
-                               }
-                       }
-               } elseif (!empty($siteinfo['image'])) {
+               if (!empty($siteinfo['image'])) {
                        $src = self::completeUrl($siteinfo['image'], $url);
 
                        unset($siteinfo['image']);
@@ -379,47 +360,15 @@ class ParseUrl
                        }
                }
 
-               if ((@$siteinfo['text'] == '') && (@$siteinfo['title'] != '') && !$no_guessing) {
-                       $text = '';
-
-                       $list = $xpath->query('//div[@class="article"]');
-                       foreach ($list as $node) {
-                               if (strlen($node->nodeValue) > 40) {
-                                       $text .= ' ' . trim($node->nodeValue);
-                               }
-                       }
-
-                       if ($text == '') {
-                               $list = $xpath->query('//div[@class="content"]');
-                               foreach ($list as $node) {
-                                       if (strlen($node->nodeValue) > 40) {
-                                               $text .= ' ' . trim($node->nodeValue);
-                                       }
-                               }
-                       }
-
-                       // If none text was found then take the paragraph content
-                       if ($text == '') {
-                               $list = $xpath->query('//p');
-                               foreach ($list as $node) {
-                                       if (strlen($node->nodeValue) > 40) {
-                                               $text .= ' ' . trim($node->nodeValue);
-                                       }
-                               }
-                       }
-
-                       if ($text != '') {
-                               $text = trim(str_replace(["\n", "\r"], [' ', ' '], $text));
-
-                               while (strpos($text, '  ')) {
-                                       $text = trim(str_replace('  ', ' ', $text));
-                               }
-
-                               $siteinfo['text'] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, 'UTF-8') . '...');
+               if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
+                       $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
+                       $pos = mb_strrpos($siteinfo['text'], '.');
+                       if ($pos > self::MIN_DESC_COUNT) {
+                               $siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
                        }
                }
 
-               Logger::log('Siteinfo for ' . $url . ' ' . print_r($siteinfo, true), Logger::DEBUG);
+               Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);
 
                Hook::callAll('getsiteinfo', $siteinfo);