]> git.mxchange.org Git - friendica.git/blobdiff - src/Util/ParseUrl.php
Check media links when fetching page data
[friendica.git] / src / Util / ParseUrl.php
index 745ab5c7498feb0d2d3a78abfb68fd20e6a2cc0d..13cb55b73ee1396acd4d03714d38c2c07be59233 100644 (file)
@@ -478,20 +478,14 @@ class ParseUrl
                        $siteinfo['type'] = 'link';
                }
 
-               if (!empty($siteinfo['image']) && empty($siteinfo['images'])) {
-                       $src = self::completeUrl($siteinfo['image'], $url);
-
+               if (!empty($siteinfo['image'])) {
+                       $siteinfo['images'] = $siteinfo['images'] ?? [];
+                       array_unshift($siteinfo['images'], ['url' => $siteinfo['image']]);
                        unset($siteinfo['image']);
-
-                       $photodata = Images::getInfoFromURLCached($src);
-
-                       if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10)) {
-                               $siteinfo['images'][] = ['src' => $src,
-                                       'width' => $photodata[0],
-                                       'height' => $photodata[1]];
-                       }
                }
 
+               $siteinfo = self::checkMedia($url, $siteinfo);
+
                if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
                        $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
                        $pos = mb_strrpos($siteinfo['text'], '.');
@@ -508,17 +502,191 @@ class ParseUrl
        }
 
        /**
-        * Parse the Json-Ld parts
+        * Check the attached media elements.
+        * Fix existing data and add missing data.
+        *
+        * @param string $page_url URL of the page, used to complete relative media URLs
+        * @param array  $siteinfo Site information, may contain "images", "audio" and "video"
+        * @return array The site information with the checked media elements
+        */
+       private static function checkMedia(string $page_url, array $siteinfo)
+       {
+               if (!empty($siteinfo['images'])) {
+                       array_walk($siteinfo['images'], function (&$image) use ($page_url) {
+                               // According to the specifications someone could place a picture url into the content field as well.
+                               // But this doesn't seem to happen in the wild, so we don't cover it here.
+                               // Entries without an "url" are discarded instead of being passed to completeUrl().
+                               $src = empty($image['url']) ? '' : self::completeUrl($image['url'], $page_url);
+                               $photodata = empty($src) ? [] : Images::getInfoFromURLCached($src);
+                               if (!empty($photodata) && ($photodata[0] > 50) && ($photodata[1] > 50)) {
+                                       $image['src'] = $src;
+                                       $image['width'] = $photodata[0];
+                                       $image['height'] = $photodata[1];
+                                       $image['contenttype'] = $photodata['mime'];
+                                       unset($image['url']);
+                                       ksort($image);
+                               } else {
+                                       $image = [];
+                               }
+                       });
+
+                       // Remove the discarded (emptied) entries and reindex the list
+                       $siteinfo['images'] = array_values(array_filter($siteinfo['images']));
+               }
+
+               foreach (['audio', 'video'] as $element) {
+                       if (!empty($siteinfo[$element])) {
+                               array_walk($siteinfo[$element], function (&$media) use ($page_url, &$siteinfo) {
+                                       $url = $embed = $content = $contenttype = '';
+                                       foreach (['embed', 'content', 'url'] as $field) {
+                                               if (!empty($media[$field])) {
+                                                       $media[$field] = self::completeUrl($media[$field], $page_url);
+                                                       $type = self::getContentType($media[$field]);
+                                                       // A "text" type is kept as "embed" or "url", any other type as media source; an empty type is skipped
+                                                       if (($type[0] ?? '') == 'text') {
+                                                               if ($field == 'embed') {
+                                                                       $embed = $media[$field];
+                                                               } else {
+                                                                       $url = $media[$field];
+                                                               }
+                                                       } elseif (!empty($type[0])) {
+                                                               $content = $media[$field];
+                                                               $contenttype = implode('/', $type);
+                                                       }
+                                               }
+                                               unset($media[$field]);
+                                       }
+
+                                       foreach (['image', 'preview'] as $field) {
+                                               if (!empty($media[$field])) {
+                                                       $media[$field] = self::completeUrl($media[$field], $page_url);
+                                               }
+                                       }
+
+                                       if (!empty($url)) {
+                                               $media['url'] = $url;
+                                       }
+                                       if (!empty($embed)) {
+                                               $media['embed'] = $embed;
+                                               if (!empty($media['main'])) {
+                                                       $siteinfo['embed'] = $embed;
+                                               }
+                                       }
+                                       if (!empty($content)) {
+                                               $media['src'] = $content;
+                                       }
+                                       if (!empty($contenttype)) {
+                                               $media['contenttype'] = $contenttype;
+                                       }
+                                       if (empty($url) && empty($content) && empty($embed)) {
+                                               $media = [];
+                                       }
+                                       ksort($media);
+                               });
+
+                               $siteinfo[$element] = array_values(array_filter($siteinfo[$element]));
+                       }
+                       if (empty($siteinfo[$element])) {
+                               unset($siteinfo[$element]);
+                       }
+               }
+               return $siteinfo;
+       }
+
+       /**
+        * Convert tags from CSV to an array
+        *
+        * @param string $string Comma separated list of tags
+        * @return array List of the tags, each one prefixed with "#"
+        */
+       public static function convertTagsToArray($string)
+       {
+               $arr_tags = str_getcsv($string);
+               if (count($arr_tags)) {
+                       // Add the # sign to every tag; callables using the "self" string are deprecated since PHP 8.2
+                       array_walk($arr_tags, [self::class, 'arrAddHashes']);
+                       return $arr_tags;
+               }
+               return [];
+       }
+
+       /**
+        * Add a hash sign to a string
         *
-        * @param array $siteinfo 
-        * @param array $jsonld 
+        * Callback for array_walk(), the tag is modified in place
+        *
+        * @param string $tag The pure tag name, passed by reference
+        * @param int    $k   Key of the array element (unused)
+        * @return void
+        */
+       private static function arrAddHashes(&$tag, $k)
+       {
+               $tag = "#{$tag}";
+       }
+
+       /**
+        * Add a scheme (and a host, if missing) to an url
+        *
+        * The src attribute of some html elements (e.g. images) can miss the scheme
+        * or the host. Protocol-relative urls ("//host/path") keep their host and port,
+        * any other path is placed directly below the host of the page.
+        *
+        * @param string $url    The url which possibly has a missing scheme (e.g. a link to an image)
+        * @param string $scheme The url with a correct scheme
+        *                       (e.g. the url from the webpage which does contain the image)
+        *
+        * @return string The url with a scheme; $url unchanged when the page url has no scheme or host
+        */
+       private static function completeUrl($url, $scheme)
+       {
+               $urlarr = parse_url($url);
+
+               // If the url does already have a scheme we can stop the process here
+               if (isset($urlarr["scheme"])) {
+                       return($url);
+               }
+
+               $schemearr = parse_url($scheme);
+
+               // Without scheme and host of the page there is nothing to complete the url with
+               if (empty($schemearr["scheme"]) || empty($schemearr["host"])) {
+                       return($url);
+               }
+
+               // Protocol-relative urls ("//host/path") keep their own host and port, only the scheme is added
+               $source = empty($urlarr["host"]) ? $schemearr : $urlarr;
+               $complete = $schemearr["scheme"]."://".$source["host"];
+               if (!empty($source["port"])) {
+                       $complete .= ":".$source["port"];
+               }
+
+               if (!empty($urlarr["path"])) {
+                       $complete .= (strpos($urlarr["path"], "/") === 0 ? "" : "/").$urlarr["path"];
+               }
+
+               if (!empty($urlarr["query"])) {
+                       $complete .= "?".$urlarr["query"];
+               }
+
+               if (!empty($urlarr["fragment"])) {
+                       $complete .= "#".$urlarr["fragment"];
+               }
+
+               return($complete);
+       }
+
+       /**
+        * Parse the Json-Ld parts of a web page
+        *
+        * @param array $siteinfo
+        * @param array $jsonld
         * @return array siteinfo
         */
        private static function parseParts(array $siteinfo, array $jsonld)
        {
                if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) {
                        foreach ($jsonld['@graph'] as $part) {
-                               $siteinfo = self::parseJsonLd($siteinfo, $part);
+                               $siteinfo = self::parseParts($siteinfo, $part);
                        }
                } elseif (!empty($jsonld['@type'])) {
                        $siteinfo = self::parseJsonLd($siteinfo, $jsonld);
@@ -543,6 +711,7 @@ class ParseUrl
        /**
         * Improve the siteinfo with information from the provided JSON-LD information
         * @see https://jsonld.com/
+        * @see https://schema.org/
         *
         * @param array $siteinfo
         * @param array $jsonld
@@ -557,8 +726,8 @@ class ParseUrl
                }
 
                // Silently ignore some types that aren't processed
-               if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork',
-                       'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', 
+               if (in_array($type, ['SiteNavigationElement', 'JobPosting', 'CreativeWork', 'MusicAlbum',
+                       'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', 'MusicRecording',
                        'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product'])) {
                        return $siteinfo;
                }
@@ -616,7 +785,7 @@ class ParseUrl
                                return self::parseJsonLdWebOrganization($siteinfo, $jsonld);
                        case 'Person':
                        case 'Patient':
-                               case 'PerformingGroup':
+                       case 'PerformingGroup':
                        case 'DanceGroup';
                        case 'MusicGroup':
                        case 'TheaterGroup':                    
@@ -635,7 +804,7 @@ class ParseUrl
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD information concerning authors and publishers
+        * Fetch author and publisher data
         *
         * @param array $siteinfo
         * @param array $jsonld
@@ -663,14 +832,33 @@ class ParseUrl
 
                        $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization');
                        if (!empty($brand) && is_array($brand)) {
-                               $content = JsonLD::fetchElement($brand, 'name', '@type', 'brand');
+                               $content = JsonLD::fetchElement($brand, 'name');
                                if (!empty($content) && is_string($content)) {
                                        $jsonldinfo['publisher_name'] = trim($content);
                                }
-                               $content = JsonLD::fetchElement($brand, 'url', '@type', 'brand');
+
+                               $content = JsonLD::fetchElement($brand, 'sameAs');
                                if (!empty($content) && is_string($content)) {
                                        $jsonldinfo['publisher_url'] = trim($content);
                                }
+
+                               $content = JsonLD::fetchElement($brand, 'url');
+                               if (!empty($content) && is_string($content)) {
+                                       $jsonldinfo['publisher_url'] = trim($content);
+                               }
+
+                               $content = JsonLD::fetchElement($brand, 'logo', 'url');
+                               if (!empty($content) && is_string($content)) {
+                                       $jsonldinfo['publisher_img'] = trim($content);
+                               }
+                       }
+
+                       $logo = JsonLD::fetchElement($jsonld, 'publisher', 'logo');
+                       if (!empty($logo) && is_array($logo)) {
+                               $content = JsonLD::fetchElement($logo, 'url');
+                               if (!empty($content) && is_string($content)) {
+                                       $jsonldinfo['publisher_img'] = trim($content);
+                               }
                        }
                } elseif (!empty($jsonld['publisher']) && is_string($jsonld['publisher'])) {
                        $jsonldinfo['publisher_name'] = trim($jsonld['publisher']);
@@ -691,17 +879,25 @@ class ParseUrl
                        if (!empty($content) && is_string($content)) {
                                $jsonldinfo['author_url'] = trim($content);
                        }
+
+                       $logo = JsonLD::fetchElement($jsonld, 'author', 'logo');
+                       if (!empty($logo) && is_array($logo)) {
+                               $content = JsonLD::fetchElement($logo, 'url');
+                               if (!empty($content) && is_string($content)) {
+                                       $jsonldinfo['author_img'] = trim($content);
+                               }
+                       }
                } elseif (!empty($jsonld['author']) && is_string($jsonld['author'])) {
                        $jsonldinfo['author_name'] = trim($jsonld['author']);
                }
 
-               Logger::info('Fetched author information', ['fetched' => $jsonldinfo]);
+               Logger::info('Fetched Author information', ['fetched' => $jsonldinfo]);
 
                return array_merge($siteinfo, $jsonldinfo);
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD Article information
+        * Fetch data from the provided JSON-LD Article type
         * @see https://schema.org/Article
         *
         * @param array $siteinfo
@@ -761,7 +957,7 @@ class ParseUrl
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD WebPage information
+        * Fetch data from the provided JSON-LD WebPage type
         * @see https://schema.org/WebPage
         *
         * @param array $siteinfo
@@ -794,13 +990,13 @@ class ParseUrl
 
                $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld);
 
-               Logger::info('Fetched webpage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
+               Logger::info('Fetched WebPage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
 
                return array_merge($siteinfo, $jsonldinfo);
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD WebSite information
+        * Fetch data from the provided JSON-LD WebSite type
         * @see https://schema.org/WebSite
         *
         * @param array $siteinfo
@@ -838,7 +1034,7 @@ class ParseUrl
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD Organization information
+        * Fetch data from the provided JSON-LD Organization type
         * @see https://schema.org/Organization
         *
         * @param array $siteinfo
@@ -889,7 +1085,7 @@ class ParseUrl
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD Person information
+        * Fetch data from the provided JSON-LD Person type
         * @see https://schema.org/Person
         *
         * @param array $siteinfo
@@ -930,7 +1126,7 @@ class ParseUrl
        }
 
        /**
-        * Improve the siteinfo with information from the provided JSON-LD MediaObject
+        * Fetch data from the provided JSON-LD MediaObject type
         * @see https://schema.org/MediaObject
         *
         * @param array $siteinfo
@@ -948,7 +1144,12 @@ class ParseUrl
 
                $content = JsonLD::fetchElement($jsonld, 'url');
                if (!empty($content)) {
-                       $media['src'] = trim($content);
+                       $media['url'] = trim($content);
+               }
+
+               $content = JsonLD::fetchElement($jsonld, 'mainEntityOfPage');
+               if (!empty($content)) {
+                       $media['main'] = Strings::compareLink($content, $siteinfo['url']);
                }
 
                $content = JsonLD::fetchElement($jsonld, 'description');
@@ -981,100 +1182,22 @@ class ParseUrl
                        $media['width'] = trim($content);
                }
 
-               $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl');
-               if (!empty($content)) {
-                       $media['preview'] = trim($content);
-               }
-
                $content = JsonLD::fetchElement($jsonld, 'image');
                if (!empty($content)) {
                        $media['image'] = trim($content);
                }
 
-               Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]);
-               $siteinfo[$name][] = $media;
-               return $siteinfo;
-       }
-
-       /**
-        * Convert tags from CSV to an array
-        *
-        * @param string $string Tags
-        * @return array with formatted Hashtags
-        */
-       public static function convertTagsToArray($string)
-       {
-               $arr_tags = str_getcsv($string);
-               if (count($arr_tags)) {
-                       // add the # sign to every tag
-                       array_walk($arr_tags, ["self", "arrAddHashes"]);
-
-                       return $arr_tags;
-               }
-       }
-
-       /**
-        * Add a hasht sign to a string
-        *
-        * This method is used as callback function
-        *
-        * @param string $tag The pure tag name
-        * @param int    $k   Counter for internal use
-        * @return void
-        */
-       private static function arrAddHashes(&$tag, $k)
-       {
-               $tag = "#" . $tag;
-       }
-
-       /**
-        * Add a scheme to an url
-        *
-        * The src attribute of some html elements (e.g. images)
-        * can miss the scheme so we need to add the correct
-        * scheme
-        *
-        * @param string $url    The url which possibly does have
-        *                       a missing scheme (a link to an image)
-        * @param string $scheme The url with a correct scheme
-        *                       (e.g. the url from the webpage which does contain the image)
-        *
-        * @return string The url with a scheme
-        */
-       private static function completeUrl($url, $scheme)
-       {
-               $urlarr = parse_url($url);
-
-               // If the url does allready have an scheme
-               // we can stop the process here
-               if (isset($urlarr["scheme"])) {
-                       return($url);
-               }
-
-               $schemearr = parse_url($scheme);
-
-               $complete = $schemearr["scheme"]."://".$schemearr["host"];
-
-               if (!empty($schemearr["port"])) {
-                       $complete .= ":".$schemearr["port"];
-               }
-
-               if (!empty($urlarr["path"])) {
-                       if (strpos($urlarr["path"], "/") !== 0) {
-                               $complete .= "/";
+               $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl');
+               if (!empty($content) && (($media['image'] ?? '') != trim($content))) {
+                       if (!empty($media['image'])) {
+                               $media['preview'] = trim($content);
+                       } else {
+                               $media['image'] = trim($content);
                        }
-
-                       $complete .= $urlarr["path"];
                }
 
-               if (!empty($urlarr["query"])) {
-                       $complete .= "?".$urlarr["query"];
-               }
-
-               if (!empty($urlarr["fragment"])) {
-                       $complete .= "#".$urlarr["fragment"];
-               }
-
-               return($complete);
+               Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]);
+               $siteinfo[$name][] = $media;
+               return $siteinfo;
        }
 }