From a3b7f08f78fa7e02642ee507699d6b59c926a0eb Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Fri, 15 Nov 2019 14:28:42 +0100 Subject: [PATCH] Don't guess the site info / restrict the description length --- src/Util/ParseUrl.php | 85 +++++++++---------------------------------- 1 file changed, 17 insertions(+), 68 deletions(-) diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 1f8fbdeabe..569136e6d1 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -17,6 +17,16 @@ use Friendica\Database\DBA; */ class ParseUrl { + /** + * Maximum number of characters for the description + */ + const MAX_DESC_COUNT = 250; + + /** + * Minimum number of characters for the description + */ + const MIN_DESC_COUNT = 100; + /** * @brief Search for chached embeddable data of an url otherwise fetch it * @@ -336,36 +346,7 @@ class ParseUrl $siteinfo['type'] = 'link'; } - if (empty($siteinfo['image']) && !$no_guessing) { - $list = $xpath->query('//img[@src]'); - foreach ($list as $node) { - $img_tag = []; - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $img_tag[$attribute->name] = $attribute->value; - } - } - - $src = self::completeUrl($img_tag['src'], $url); - $photodata = Images::getInfoFromURLCached($src); - - if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) { - if ($photodata[0] > 300) { - $photodata[1] = round($photodata[1] * (300 / $photodata[0])); - $photodata[0] = 300; - } - if ($photodata[1] > 300) { - $photodata[0] = round($photodata[0] * (300 / $photodata[1])); - $photodata[1] = 300; - } - $siteinfo['images'][] = [ - 'src' => $src, - 'width' => $photodata[0], - 'height' => $photodata[1] - ]; - } - } - } elseif (!empty($siteinfo['image'])) { + if (!empty($siteinfo['image'])) { $src = self::completeUrl($siteinfo['image'], $url); unset($siteinfo['image']); @@ -379,47 +360,15 @@ class ParseUrl } } - if ((@$siteinfo['text'] == '') && (@$siteinfo['title'] != '') && !$no_guessing) { - $text = ''; - - $list = $xpath->query('//div[@class="article"]'); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= ' ' . trim($node->nodeValue); - } - } - - if ($text == '') { - $list = $xpath->query('//div[@class="content"]'); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= ' ' . trim($node->nodeValue); - } - } - } - - // If none text was found then take the paragraph content - if ($text == '') { - $list = $xpath->query('//p'); - foreach ($list as $node) { - if (strlen($node->nodeValue) > 40) { - $text .= ' ' . trim($node->nodeValue); - } - } - } - - if ($text != '') { - $text = trim(str_replace(["\n", "\r"], [' ', ' '], $text)); - - while (strpos($text, ' ')) { - $text = trim(str_replace(' ', ' ', $text)); - } - - $siteinfo['text'] = trim(html_entity_decode(substr($text, 0, 350), ENT_QUOTES, 'UTF-8') . '...'); + if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) { + $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…'; + $pos = mb_strrpos($siteinfo['text'], '.'); + if ($pos > self::MIN_DESC_COUNT) { + $siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1); } } - Logger::log('Siteinfo for ' . $url . ' ' . print_r($siteinfo, true), Logger::DEBUG); + Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]); Hook::callAll('getsiteinfo', $siteinfo); -- 2.39.5