From: Hypolite Petovan <hypolite@mrpetovan.com>
Date: Tue, 16 Feb 2021 15:16:04 +0000 (-0500)
Subject: Add native media types and expiration to getSiteInfo
X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=69802554fdfe9272fd05f3e55379286eb3339011;p=friendica.git

Add native media types and expiration to getSiteInfo
---

diff --git a/mod/parse_url.php b/mod/parse_url.php
index 82325aa553..83997958d9 100644
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -180,28 +180,3 @@ function parse_url_content(App $a)
 
 	exit();
 }
-
-/**
- * Legacy function to call ParseUrl::getSiteinfoCached
- *
- * Note: We have moved the function to ParseUrl.php. This function is only for
- * legacy support and will be remove in the future
- *
- * @param string $url         The url of the page which should be scraped
- * @param bool   $no_guessing If true the parse doens't search for
- *                            preview pictures
- * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
- *                            to avoid endless loops
- *
- * @return array which contains needed data for embedding
- *
- * @throws \Friendica\Network\HTTPException\InternalServerErrorException
- * @see   ParseUrl::getSiteinfoCached()
- *
- * @deprecated since version 3.6 use ParseUrl::getSiteinfoCached instead
- */
-function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true)
-{
-	$siteinfo = ParseUrl::getSiteinfoCached($url, $no_guessing, $do_oembed);
-	return $siteinfo;
-}
diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php
index 15186b5737..de280bcf85 100644
--- a/src/Util/ParseUrl.php
+++ b/src/Util/ParseUrl.php
@@ -29,6 +29,7 @@ use Friendica\Core\Logger;
 use Friendica\Database\Database;
 use Friendica\Database\DBA;
 use Friendica\DI;
+use Friendica\Network\HTTPException;
 
 /**
  * Get information about a given URL
@@ -37,6 +38,9 @@ use Friendica\DI;
  */
 class ParseUrl
 {
+	const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day';
+	const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months';
+
 	/**
 	 * Maximum number of characters for the description
 	 */
@@ -65,18 +69,23 @@ class ParseUrl
 	 *    array  'images'   => (optional) Array of preview pictures
 	 *    string 'keywords' => (optional) The tags which belong to the content
 	 *
-	 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
+	 * @throws HTTPException\InternalServerErrorException
 	 * @see   ParseUrl::getSiteinfo() for more information about scraping
 	 * embeddable content
 	 */
-	public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
+	public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true): array
 	{
-		if ($url == "") {
-			return false;
+		if (empty($url)) {
+			return [
+				'url' => '',
+				'type' => 'error',
+			];
 		}
 
+		$urlHash = hash('sha256', $url);
+
 		$parsed_url = DBA::selectFirst('parsed_url', ['content'],
-			['url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
+			['url_hash' => $urlHash, 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
 		);
 		if (!empty($parsed_url['content'])) {
 			$data = unserialize($parsed_url['content']);
@@ -85,12 +94,20 @@ class ParseUrl
 
 		$data = self::getSiteinfo($url, $no_guessing, $do_oembed);
 
-		DBA::insert(
+		$expires = $data['expires'];
+
+		unset($data['expires']);
+
+		DI::dba()->insert(
 			'parsed_url',
 			[
-				'url' => substr(Strings::normaliseLink($url), 0, 255), 'guessing' => !$no_guessing,
-				'oembed' => $do_oembed, 'content' => serialize($data),
-				'created' => DateTimeFormat::utcNow()
+				'url_hash' => $urlHash,
+				'guessing' => !$no_guessing,
+				'oembed'   => $do_oembed,
+				'url'      => $url,
+				'content'  => serialize($data),
+				'created'  => DateTimeFormat::utcNow(),
+				'expires'  => $expires,
 			],
 			Database::INSERT_UPDATE
 		);
@@ -117,7 +134,7 @@ class ParseUrl
 	 *
 	 * @return array which contains needed data for embedding
 	 *    string 'url'      => The url of the parsed page
-	 *    string 'type'     => Content type
+	 *    string 'type'     => Content type (error, link, photo, image, audio, video)
 	 *    string 'title'    => (optional) The title of the content
 	 *    string 'text'     => (optional) The description for the content
 	 *    string 'image'    => (optional) A preview image of the content (only available if $no_guessing = false)
@@ -140,6 +157,13 @@ class ParseUrl
 	 */
 	public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
 	{
+		if (empty($url)) {
+			return [
+				'url' => '',
+				'type' => 'error',
+			];
+		}
+
 		// Check if the URL does contain a scheme
 		$scheme = parse_url($url, PHP_URL_SCHEME);
 
@@ -154,6 +178,7 @@ class ParseUrl
 		$siteinfo = [
 			'url' => $url,
 			'type' => 'link',
+			'expires' => DateTimeFormat::utc(self::DEFAULT_EXPIRATION_FAILURE),
 		];
 
 		if ($count > 10) {
@@ -166,16 +191,35 @@ class ParseUrl
 			return $siteinfo;
 		}
 
+		$siteinfo['expires'] = DateTimeFormat::utc(self::DEFAULT_EXPIRATION_SUCCESS);
+
 		// If the file is too large then exit
 		if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) {
 			return $siteinfo;
 		}
 
+		// Native media type, no need for HTML parsing
+		$type = $curlResult->getHeader('Content-Type');
+		if ($type) {
+			preg_match('#(image|video|audio)/#i', $type, $matches);
+			if ($matches) {
+				$siteinfo['type'] = array_pop($matches);
+				return $siteinfo;
+			}
+		}
+
 		// If it isn't a HTML file then exit
 		if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
 			return $siteinfo;
 		}
 
+		if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
+			if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
+				$maxAge = max(86400, (int)array_pop($matches));
+				$siteinfo['expires'] = DateTimeFormat::utc("now + $maxAge seconds");
+			}
+		}
+
 		$header = $curlResult->getHeader();
 		$body = $curlResult->getBody();
 
diff --git a/src/Worker/ClearCache.php b/src/Worker/ClearCache.php
index 5eee4c74ab..a836e5bec6 100644
--- a/src/Worker/ClearCache.php
+++ b/src/Worker/ClearCache.php
@@ -64,7 +64,7 @@ class ClearCache
 		// Delete the cached OEmbed entries that are older than three month
 		DBA::delete('oembed', ["`created` < NOW() - INTERVAL 3 MONTH"]);
 
-		// Delete the cached "parse_url" entries that are older than three month
-		DBA::delete('parsed_url', ["`created` < NOW() - INTERVAL 3 MONTH"]);
+		// Delete the cached "parsed_url" entries that are expired
+		DBA::delete('parsed_url', ["`expires` < NOW()"]);
 	}
 }