+ ksort($siteinfo);
+
+ return $siteinfo;
+ }
+
+ /**
+  * Validate and normalise the media entries attached to a page.
+  *
+  * Images are kept only when they carry an "url" and the fetched image
+  * information reports a width and a height above 50; kept entries get
+  * "src", "width", "height" and "contenttype" and lose their "url".
+  *
+  * Audio and video entries have their "embed", "content" and "url" fields
+  * completed and then sorted by detected content type into "embed", "url"
+  * or "src" (plus "contenttype"). Entries that end up with none of these
+  * are dropped, and an element without remaining entries is removed from
+  * the result altogether.
+  *
+  * @param string $page_url URL of the page, used to complete media URLs
+  * @param array  $siteinfo Page data collected so far
+  * @return array The page data with cleaned up media lists
+  */
+ private static function checkMedia(string $page_url, array $siteinfo) : array
+ {
+     if (!empty($siteinfo['images'])) {
+         $images = [];
+
+         foreach ($siteinfo['images'] as $image) {
+             // According to the specifications someone could place a picture url into the content field as well.
+             // But this doesn't seem to happen in the wild, so we don't cover it here.
+             if (empty($image['url'])) {
+                 continue;
+             }
+
+             $src       = self::completeUrl($image['url'], $page_url);
+             $photodata = Images::getInfoFromURLCached($src);
+
+             // Entries without image information or with tiny dimensions are discarded
+             $usable = !empty($photodata) && ($photodata[0] > 50) && ($photodata[1] > 50);
+             if (!$usable) {
+                 continue;
+             }
+
+             unset($image['url']);
+             $image['src']         = $src;
+             $image['width']       = $photodata[0];
+             $image['height']      = $photodata[1];
+             $image['contenttype'] = $photodata['mime'];
+             ksort($image);
+
+             $images[] = $image;
+         }
+
+         $siteinfo['images'] = $images;
+     }
+
+     foreach (['audio', 'video'] as $element) {
+         if (empty($siteinfo[$element])) {
+             unset($siteinfo[$element]);
+             continue;
+         }
+
+         $entries = [];
+
+         foreach ($siteinfo[$element] as $media) {
+             $link     = '';
+             $player   = '';
+             $source   = '';
+             $mimetype = '';
+
+             // Later fields overwrite the findings of earlier ones
+             foreach (['embed', 'content', 'url'] as $field) {
+                 $candidate = $media[$field] ?? '';
+                 unset($media[$field]);
+
+                 if (empty($candidate)) {
+                     continue;
+                 }
+
+                 $candidate = self::completeUrl($candidate, $page_url);
+                 $type      = self::getContentType($candidate);
+
+                 if (($type[0] ?? '') == 'text') {
+                     // Textual content is either an embeddable player or a plain link
+                     if ($field == 'embed') {
+                         $player = $candidate;
+                     } else {
+                         $link = $candidate;
+                     }
+                 } elseif (!empty($type[0])) {
+                     // Any other known type is treated as the media file itself
+                     $source   = $candidate;
+                     $mimetype = implode('/', $type);
+                 }
+             }
+
+             foreach (['image', 'preview'] as $field) {
+                 if (!empty($media[$field])) {
+                     $media[$field] = self::completeUrl($media[$field], $page_url);
+                 }
+             }
+
+             // Nothing usable was found, so the entry is dropped
+             if (empty($link) && empty($source) && empty($player)) {
+                 continue;
+             }
+
+             if (!empty($link)) {
+                 $media['url'] = $link;
+             }
+
+             if (!empty($player)) {
+                 $media['embed'] = $player;
+
+                 // The first embed that is found becomes the page wide player
+                 if (empty($siteinfo['player']['embed'])) {
+                     $siteinfo['player']['embed'] = $player;
+                 }
+             }
+
+             if (!empty($source)) {
+                 $media['src'] = $source;
+             }
+
+             if (!empty($mimetype)) {
+                 $media['contenttype'] = $mimetype;
+             }
+
+             ksort($media);
+             $entries[] = $media;
+         }
+
+         if (empty($entries)) {
+             unset($siteinfo[$element]);
+         } else {
+             $siteinfo[$element] = $entries;
+         }
+     }
+
+     return $siteinfo;
+ }
+
+ /**
+  * Convert tags from CSV to an array
+  *
+  * Every tag is trimmed and prefixed with a "#" sign. Empty fields (for
+  * example from an empty input string or from doubled commas) are skipped,
+  * so the result never contains a bare "#".
+  *
+  * @param string $string Tags as comma separated values
+  * @return array with formatted Hashtags, empty when there are no tags
+  */
+ public static function convertTagsToArray($string)
+ {
+     $arr_tags = [];
+
+     // The escape character is passed explicitly: it equals the historic
+     // default, and relying on the default is deprecated as of PHP 8.4.
+     foreach (str_getcsv((string)$string, ',', '"', '\\') as $tag) {
+         // str_getcsv() returns a single null field for an empty string
+         // and keeps the whitespace that follows a comma
+         $tag = trim((string)$tag);
+         if ($tag !== '') {
+             $arr_tags[] = $tag;
+         }
+     }
+
+     // add the # sign to every tag
+     // (self::class instead of the "self" string, which is a deprecated callable form)
+     array_walk($arr_tags, [self::class, 'arrAddHashes']);
+
+     return $arr_tags;
+ }
+
+ /**
+  * Prefix a tag with a hash sign
+  *
+  * Intended as a callback for array_walk(): the tag is modified in place
+  * and the key handed over by array_walk() is ignored.
+  *
+  * @param string $tag The pure tag name, receives the leading "#"
+  * @param int    $k   Array key supplied by array_walk(), not used
+  * @return void
+  */
+ private static function arrAddHashes(&$tag, $k)
+ {
+     $tag = "#{$tag}";
+ }
+
+ /**
+  * Add a scheme (and, if needed, a host) to an url
+  *
+  * The src attribute of some html elements (e.g. images)
+  * can miss the scheme so we need to add the correct
+  * scheme. Three cases are handled:
+  *  - the url already has a scheme: it is returned unchanged
+  *  - the url is protocol-relative ("//host/path"): its own host and port
+  *    are kept and only the scheme is taken from the reference url
+  *  - the url has no host: scheme, host and port come from the reference url
+  *
+  * When the reference url provides no scheme, or no host can be determined,
+  * the url is returned unchanged because there is nothing to complete it with.
+  *
+  * NOTE(review): a path that does not start with "/" is anchored at the
+  * root of the host, not at the directory of the reference url.
+  *
+  * @param string $url    The url which possibly does have
+  *                       a missing scheme (a link to an image)
+  * @param string $scheme The url with a correct scheme
+  *                       (e.g. the url from the webpage which does contain the image)
+  *
+  * @return string The url with a scheme
+  */
+ private static function completeUrl($url, $scheme)
+ {
+     $urlarr = parse_url($url);
+
+     // If the url does already have a scheme
+     // we can stop the process here
+     if (isset($urlarr["scheme"])) {
+         return $url;
+     }
+
+     $schemearr = parse_url($scheme);
+
+     // Without a scheme in the reference url there is nothing we could add
+     if (empty($schemearr["scheme"])) {
+         return $url;
+     }
+
+     // A protocol-relative url brings its own host and port,
+     // otherwise both are taken from the reference url
+     $authority = !empty($urlarr["host"]) ? $urlarr : $schemearr;
+     if (empty($authority["host"])) {
+         return $url;
+     }
+
+     $complete = $schemearr["scheme"]."://".$authority["host"];
+
+     if (!empty($authority["port"])) {
+         $complete .= ":".$authority["port"];
+     }
+
+     if (!empty($urlarr["path"])) {
+         // Make sure that host and path are separated by a slash
+         if (strpos($urlarr["path"], "/") !== 0) {
+             $complete .= "/";
+         }
+
+         $complete .= $urlarr["path"];
+     }
+
+     if (!empty($urlarr["query"])) {
+         $complete .= "?".$urlarr["query"];
+     }
+
+     if (!empty($urlarr["fragment"])) {
+         $complete .= "#".$urlarr["fragment"];
+     }
+
+     return $complete;
+ }
+
+ /**
+ * Parse the Json-Ld parts of a web page
+ *
+ * @param array $siteinfo
+ * @param array $jsonld
+ * @return array siteinfo
+ */
+ private static function parseParts(array $siteinfo, array $jsonld)
+ {
+ if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) {
+ foreach ($jsonld['@graph'] as $part) {
+ if (!empty($part) && is_array($part)) {
+ $siteinfo = self::parseParts($siteinfo, $part);
+ }
+ }
+ } elseif (!empty($jsonld['@type'])) {
+ $siteinfo = self::parseJsonLd($siteinfo, $jsonld);
+ } elseif (!empty($jsonld)) {
+ $keys = array_keys($jsonld);
+ $numeric_keys = true;
+ foreach ($keys as $key) {
+ if (!is_int($key)) {
+ $numeric_keys = false;
+ }
+ }
+ if ($numeric_keys) {
+ foreach ($jsonld as $part) {
+ if (!empty($part) && is_array($part)) {
+ $siteinfo = self::parseParts($siteinfo, $part);
+ }
+ }
+ }
+ }
+
+ array_walk_recursive($siteinfo, function (&$element) {
+ if (is_string($element)) {
+ $element = trim(strip_tags(html_entity_decode($element, ENT_COMPAT, 'UTF-8')));
+ }
+ });
+