Add native media types and expiration to getSiteInfo

author Hypolite Petovan <hypolite@mrpetovan.com>

Tue, 16 Feb 2021 15:16:04 +0000 (10:16 -0500)

committer Hypolite Petovan <hypolite@mrpetovan.com>

Thu, 18 Feb 2021 14:06:11 +0000 (09:06 -0500)
author Hypolite Petovan <hypolite@mrpetovan.com>
Tue, 16 Feb 2021 15:16:04 +0000 (10:16 -0500)
committer Hypolite Petovan <hypolite@mrpetovan.com>
Thu, 18 Feb 2021 14:06:11 +0000 (09:06 -0500)
diff --git a/mod/parse_url.php b/mod/parse_url.php

index 82325aa553698a98d5d89b678bb715d7bdc259cd..83997958d9f8d1a4ca1a45d5b897da0589c2f158 100644 (file)
--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -180,28 +180,3 @@ function parse_url_content(App $a)
  
         exit();
  }
-
-/**
- * Legacy function to call ParseUrl::getSiteinfoCached
- *
- * Note: We have moved the function to ParseUrl.php. This function is only for
- * legacy support and will be remove in the future
- *
- * @param string $url         The url of the page which should be scraped
- * @param bool   $no_guessing If true the parse doens't search for
- *                            preview pictures
- * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
- *                            to avoid endless loops
- *
- * @return array which contains needed data for embedding
- *
- * @throws \Friendica\Network\HTTPException\InternalServerErrorException
- * @see   ParseUrl::getSiteinfoCached()
- *
- * @deprecated since version 3.6 use ParseUrl::getSiteinfoCached instead
- */
-function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true)
-{
-       $siteinfo = ParseUrl::getSiteinfoCached($url, $no_guessing, $do_oembed);
-       return $siteinfo;
-}
diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php

index 15186b57374bd9246444dcf484f575d6cfec7eaf..de280bcf85a623870d27952d1049208f52454d01 100644 (file)
--- a/src/Util/ParseUrl.php
+++ b/src/Util/ParseUrl.php
@@ -29,6 +29,7 @@ use Friendica\Core\Logger;
  use Friendica\Database\Database;
  use Friendica\Database\DBA;
  use Friendica\DI;
+use Friendica\Network\HTTPException;
  
  /**
   * Get information about a given URL
@@ -37,6 +38,9 @@ use Friendica\DI;
   */
  class ParseUrl
  {
+       const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day';
+       const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months';
+
         /**
          * Maximum number of characters for the description
          */
@@ -65,18 +69,23 @@ class ParseUrl
          *    array  'images'   => (optional) Array of preview pictures
          *    string 'keywords' => (optional) The tags which belong to the content
          *
-        * @throws \Friendica\Network\HTTPException\InternalServerErrorException
+        * @throws HTTPException\InternalServerErrorException
          * @see   ParseUrl::getSiteinfo() for more information about scraping
          * embeddable content
          */
-       public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
+       public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true): array
         {
-               if ($url == "") {
-                       return false;
+               if (empty($url)) {
+                       return [
+                               'url' => '',
+                               'type' => 'error',
+                       ];
                 }
  
+               $urlHash = hash('sha256', $url);
+
                 $parsed_url = DBA::selectFirst('parsed_url', ['content'],
-                       ['url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
+                       ['url_hash' => $urlHash, 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
                 );
                 if (!empty($parsed_url['content'])) {
                         $data = unserialize($parsed_url['content']);
@@ -85,12 +94,20 @@ class ParseUrl
  
                 $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
  
-               DBA::insert(
+               $expires = $data['expires'];
+
+               unset($data['expires']);
+
+               DI::dba()->insert(
                         'parsed_url',
                         [
-                               'url' => substr(Strings::normaliseLink($url), 0, 255), 'guessing' => !$no_guessing,
-                               'oembed' => $do_oembed, 'content' => serialize($data),
-                               'created' => DateTimeFormat::utcNow()
+                               'url_hash' => $urlHash,
+                               'guessing' => !$no_guessing,
+                               'oembed'   => $do_oembed,
+                               'url'      => $url,
+                               'content'  => serialize($data),
+                               'created'  => DateTimeFormat::utcNow(),
+                               'expires'  => $expires,
                         ],
                         Database::INSERT_UPDATE
                 );
@@ -117,7 +134,7 @@ class ParseUrl
          *
          * @return array which contains needed data for embedding
          *    string 'url'      => The url of the parsed page
-        *    string 'type'     => Content type
+        *    string 'type'     => Content type (error, link, photo, image, audio, video)
          *    string 'title'    => (optional) The title of the content
          *    string 'text'     => (optional) The description for the content
          *    string 'image'    => (optional) A preview image of the content (only available if $no_guessing = false)
@@ -140,6 +157,13 @@ class ParseUrl
          */
         public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1)
         {
+               if (empty($url)) {
+                       return [
+                               'url' => '',
+                               'type' => 'error',
+                       ];
+               }
+
                 // Check if the URL does contain a scheme
                 $scheme = parse_url($url, PHP_URL_SCHEME);
  
@@ -154,6 +178,7 @@ class ParseUrl
                 $siteinfo = [
                         'url' => $url,
                         'type' => 'link',
+                       'expires' => DateTimeFormat::utc(self::DEFAULT_EXPIRATION_FAILURE),
                 ];
  
                 if ($count > 10) {
@@ -166,16 +191,35 @@ class ParseUrl
                         return $siteinfo;
                 }
  
+               $siteinfo['expires'] = DateTimeFormat::utc(self::DEFAULT_EXPIRATION_SUCCESS);
+
                 // If the file is too large then exit
                 if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) {
                         return $siteinfo;
                 }
  
+               // Native media type, no need for HTML parsing
+               $type = $curlResult->getHeader('Content-Type');
+               if ($type) {
+                       preg_match('#(image|video|audio)/#i', $type, $matches);
+                       if ($matches) {
+                               $siteinfo['type'] = array_pop($matches);
+                               return $siteinfo;
+                       }
+               }
+
                 // If it isn't a HTML file then exit
                 if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
                         return $siteinfo;
                 }
  
+               if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
+                       if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
+                               $maxAge = max(86400, (int)array_pop($matches));
+                               $siteinfo['expires'] = DateTimeFormat::utc("now + $maxAge seconds");
+                       }
+               }
+
                 $header = $curlResult->getHeader();
                 $body = $curlResult->getBody();
  
diff --git a/src/Worker/ClearCache.php b/src/Worker/ClearCache.php

index 5eee4c74ab9a0429d243a3ff8dc3af9fd3c9c821..a836e5bec6e36c704f2b287db6ea9b25d1b5ed66 100644 (file)
--- a/src/Worker/ClearCache.php
+++ b/src/Worker/ClearCache.php
@@ -64,7 +64,7 @@ class ClearCache
                 // Delete the cached OEmbed entries that are older than three month
                 DBA::delete('oembed', ["`created` < NOW() - INTERVAL 3 MONTH"]);
  
-               // Delete the cached "parse_url" entries that are older than three month
-               DBA::delete('parsed_url', ["`created` < NOW() - INTERVAL 3 MONTH"]);
+               // Delete the cached "parsed_url" entries that are expired
+               DBA::delete('parsed_url', ["`expires` < NOW()"]);
         }
  }
author	Hypolite Petovan <hypolite@mrpetovan.com>
	Tue, 16 Feb 2021 15:16:04 +0000 (10:16 -0500)
committer	Hypolite Petovan <hypolite@mrpetovan.com>
	Thu, 18 Feb 2021 14:06:11 +0000 (09:06 -0500)
mod/parse_url.php		patch | blob | history
src/Util/ParseUrl.php		patch | blob | history
src/Worker/ClearCache.php		patch | blob | history