]> git.mxchange.org Git - friendica.git/blob - src/Util/ParseUrl.php
Some more "accept" parameters are added
[friendica.git] / src / Util / ParseUrl.php
1 <?php
2 /**
3  * @copyright Copyright (C) 2010-2022, the Friendica project
4  *
5  * @license GNU AGPL version 3 or any later version
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Affero General Public License as
9  * published by the Free Software Foundation, either version 3 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Affero General Public License for more details.
16  *
17  * You should have received a copy of the GNU Affero General Public License
18  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  */
21
22 namespace Friendica\Util;
23
24 use DOMDocument;
25 use DOMXPath;
26 use Friendica\Content\OEmbed;
27 use Friendica\Core\Hook;
28 use Friendica\Core\Logger;
29 use Friendica\Database\Database;
30 use Friendica\Database\DBA;
31 use Friendica\DI;
32 use Friendica\Network\HTTPClient\Client\HttpClient;
33 use Friendica\Network\HTTPException;
34 use Friendica\Network\HTTPClient\Client\HttpClientOptions;
35
36 /**
37  * Get information about a given URL
38  *
39  * Class with methods for extracting certain content from an url
40  */
41 class ParseUrl
42 {
43         const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day';
44         const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months';
45
46         /**
47          * Maximum number of characters for the description
48          */
49         const MAX_DESC_COUNT = 250;
50
51         /**
52          * Minimum number of characters for the description
53          */
54         const MIN_DESC_COUNT = 100;
55
56         /**
57          * Fetch the content type of the given url
58          * @param string $url    URL of the page
59          * @param string $accept content-type to accept
60          * @return array content type
61          */
62         public static function getContentType(string $url, string $accept = HttpClient::ACCEPT_DEFAULT)
63         {
64                 $curlResult = DI::httpClient()->head($url, [HttpClientOptions::ACCEPT_CONTENT => $accept]);
65
66                 // Workaround for systems that can't handle a HEAD request
67                 if (!$curlResult->isSuccess() && ($curlResult->getReturnCode() == 405)) {
68                         $curlResult = DI::httpClient()->get($url, [HttpClientOptions::CONTENT_LENGTH => 1000000, HttpClientOptions::ACCEPT_CONTENT => $accept]);
69                 }
70
71                 if (!$curlResult->isSuccess()) {
72                         return [];
73                 }
74
75                 $contenttype =  $curlResult->getHeader('Content-Type')[0] ?? '';
76                 if (empty($contenttype)) {
77                         return ['application', 'octet-stream'];
78                 }
79
80                 return explode('/', current(explode(';', $contenttype)));
81         }
82
83         /**
84          * Search for chached embeddable data of an url otherwise fetch it
85          *
86          * @param string $url         The url of the page which should be scraped
87          * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
88          *                            to avoid endless loops
89          *
90          * @return array which contains needed data for embedding
91          *    string 'url'      => The url of the parsed page
92          *    string 'type'     => Content type
93          *    string 'title'    => (optional) The title of the content
94          *    string 'text'     => (optional) The description for the content
95          *    string 'image'    => (optional) A preview image of the content
96          *    array  'images'   => (optional) Array of preview pictures
97          *    string 'keywords' => (optional) The tags which belong to the content
98          *
99          * @throws HTTPException\InternalServerErrorException
100          * @see   ParseUrl::getSiteinfo() for more information about scraping
101          * embeddable content
102          */
103         public static function getSiteinfoCached($url, $do_oembed = true): array
104         {
105                 if (empty($url)) {
106                         return [
107                                 'url' => '',
108                                 'type' => 'error',
109                         ];
110                 }
111
112                 $urlHash = hash('sha256', $url);
113
114                 $parsed_url = DBA::selectFirst('parsed_url', ['content'],
115                         ['url_hash' => $urlHash, 'oembed' => $do_oembed]
116                 );
117                 if (!empty($parsed_url['content'])) {
118                         $data = unserialize($parsed_url['content']);
119                         return $data;
120                 }
121
122                 $data = self::getSiteinfo($url, $do_oembed);
123
124                 $expires = $data['expires'];
125
126                 unset($data['expires']);
127
128                 DI::dba()->insert(
129                         'parsed_url',
130                         [
131                                 'url_hash' => $urlHash,
132                                 'oembed'   => $do_oembed,
133                                 'url'      => $url,
134                                 'content'  => serialize($data),
135                                 'created'  => DateTimeFormat::utcNow(),
136                                 'expires'  => $expires,
137                         ],
138                         Database::INSERT_UPDATE
139                 );
140
141                 return $data;
142         }
143
144         /**
145          * Parse a page for embeddable content information
146          *
147          * This method parses to url for meta data which can be used to embed
148          * the content. If available it prioritizes Open Graph meta tags.
149          * If this is not available it uses the twitter cards meta tags.
150          * As fallback it uses standard html elements with meta informations
151          * like \<title\>Awesome Title\</title\> or
152          * \<meta name="description" content="An awesome description"\>
153          *
154          * @param string $url         The url of the page which should be scraped
155          * @param bool   $do_oembed   The false option is used by the function fetch_oembed()
156          *                            to avoid endless loops
157          * @param int    $count       Internal counter to avoid endless loops
158          *
159          * @return array which contains needed data for embedding
160          *    string 'url'      => The url of the parsed page
161          *    string 'type'     => Content type (error, link, photo, image, audio, video)
162          *    string 'title'    => (optional) The title of the content
163          *    string 'text'     => (optional) The description for the content
164          *    string 'image'    => (optional) A preview image of the content
165          *    array  'images'   => (optional) Array of preview pictures
166          *    string 'keywords' => (optional) The tags which belong to the content
167          *
168          * @throws \Friendica\Network\HTTPException\InternalServerErrorException
169          * @todo  https://developers.google.com/+/plugins/snippet/
170          * @verbatim
171          * <meta itemprop="name" content="Awesome title">
172          * <meta itemprop="description" content="An awesome description">
173          * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
174          *
175          * <body itemscope itemtype="http://schema.org/Product">
176          *   <h1 itemprop="name">Shiny Trinket</h1>
177          *   <img itemprop="image" src="{image-url}" />
178          *   <p itemprop="description">Shiny trinkets are shiny.</p>
179          * </body>
180          * @endverbatim
181          */
182         public static function getSiteinfo($url, $do_oembed = true, $count = 1)
183         {
184                 if (empty($url)) {
185                         return [
186                                 'url' => '',
187                                 'type' => 'error',
188                         ];
189                 }
190
191                 // Check if the URL does contain a scheme
192                 $scheme = parse_url($url, PHP_URL_SCHEME);
193
194                 if ($scheme == '') {
195                         $url = 'http://' . ltrim($url, '/');
196                 }
197
198                 $url = trim($url, "'\"");
199
200                 $url = Network::stripTrackingQueryParams($url);
201
202                 $siteinfo = [
203                         'url' => $url,
204                         'type' => 'link',
205                         'expires' => DateTimeFormat::utc(self::DEFAULT_EXPIRATION_FAILURE),
206                 ];
207
208                 if ($count > 10) {
209                         Logger::notice('Endless loop detected', ['url' => $url]);
210                         return $siteinfo;
211                 }
212
213                 $type = self::getContentType($url);
214                 Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]);
215                 if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) {
216                         $siteinfo['type'] = $type[0];
217                         return $siteinfo;
218                 }
219
220                 if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) {
221                         Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]);
222                         return $siteinfo;
223                 }
224
225                 $curlResult = DI::httpClient()->get($url, [HttpClientOptions::CONTENT_LENGTH => 1000000, HttpClientOptions::ACCEPT_CONTENT => HttpClient::ACCEPT_HTML]);
226                 if (!$curlResult->isSuccess() || empty($curlResult->getBody())) {
227                         Logger::info('Empty body or error when fetching', ['url' => $url, 'success' => $curlResult->isSuccess(), 'code' => $curlResult->getReturnCode()]);
228                         return $siteinfo;
229                 }
230
231                 $siteinfo['expires'] = DateTimeFormat::utc(self::DEFAULT_EXPIRATION_SUCCESS);
232
233                 if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')[0] ?? '') {
234                         if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
235                                 $maxAge = max(86400, (int)array_pop($matches));
236                                 $siteinfo['expires'] = DateTimeFormat::utc("now + $maxAge seconds");
237                         }
238                 }
239
240                 $body = $curlResult->getBody();
241
242                 if ($do_oembed) {
243                         $oembed_data = OEmbed::fetchURL($url, false, false);
244
245                         if (!empty($oembed_data->type)) {
246                                 if (!in_array($oembed_data->type, ['error', 'rich', 'image', 'video', 'audio', ''])) {
247                                         $siteinfo['type'] = $oembed_data->type;
248                                 }
249
250                                 // See https://github.com/friendica/friendica/pull/5763#discussion_r217913178
251                                 if ($siteinfo['type'] != 'photo') {
252                                         if (!empty($oembed_data->title)) {
253                                                 $siteinfo['title'] = trim($oembed_data->title);
254                                         }
255                                         if (!empty($oembed_data->description)) {
256                                                 $siteinfo['text'] = trim($oembed_data->description);
257                                         }
258                                         if (!empty($oembed_data->author_name)) {
259                                                 $siteinfo['author_name'] = trim($oembed_data->author_name);
260                                         }
261                                         if (!empty($oembed_data->author_url)) {
262                                                 $siteinfo['author_url'] = trim($oembed_data->author_url);
263                                         }
264                                         if (!empty($oembed_data->provider_name)) {
265                                                 $siteinfo['publisher_name'] = trim($oembed_data->provider_name);
266                                         }
267                                         if (!empty($oembed_data->provider_url)) {
268                                                 $siteinfo['publisher_url'] = trim($oembed_data->provider_url);
269                                         }
270                                         if (!empty($oembed_data->thumbnail_url)) {
271                                                 $siteinfo['image'] = $oembed_data->thumbnail_url;
272                                         }
273                                 }
274                         }
275                 }
276
277                 $charset = '';
278                 // Look for a charset, first in headers
279                 // Expected form: Content-Type: text/html; charset=ISO-8859-4
280                 if (preg_match('/charset=([a-z0-9-_.\/]+)/i', $curlResult->getContentType(), $matches)) {
281                         $charset = trim(trim(trim(array_pop($matches)), ';,'));
282                 }
283
284                 // Then in body that gets precedence
285                 // Expected forms:
286                 // - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
287                 // - <meta charset="utf-8">
288                 // - <meta charset=utf-8>
289                 // - <meta charSet="utf-8">
290                 // We escape <style> and <script> tags since they can contain irrelevant charset information
291                 // (see https://github.com/friendica/friendica/issues/9251#issuecomment-698636806)
292                 Strings::performWithEscapedBlocks($body, '#<(?:style|script).*?</(?:style|script)>#ism', function ($body) use (&$charset) {
293                         if (preg_match('/charset=["\']?([a-z0-9-_.\/]+)/i', $body, $matches)) {
294                                 $charset = trim(trim(trim(array_pop($matches)), ';,'));
295                         }
296                 });
297
298                 $siteinfo['charset'] = $charset;
299
300                 if ($charset && strtoupper($charset) != 'UTF-8') {
301                         // See https://github.com/friendica/friendica/issues/5470#issuecomment-418351211
302                         $charset = str_ireplace('latin-1', 'latin1', $charset);
303
304                         Logger::info('detected charset', ['charset' => $charset]);
305                         $body = iconv($charset, 'UTF-8//TRANSLIT', $body);
306                 }
307
308                 $body = mb_convert_encoding($body, 'HTML-ENTITIES', 'UTF-8');
309
310                 $doc = new DOMDocument();
311                 @$doc->loadHTML($body);
312
313                 XML::deleteNode($doc, 'style');
314                 XML::deleteNode($doc, 'option');
315                 XML::deleteNode($doc, 'h1');
316                 XML::deleteNode($doc, 'h2');
317                 XML::deleteNode($doc, 'h3');
318                 XML::deleteNode($doc, 'h4');
319                 XML::deleteNode($doc, 'h5');
320                 XML::deleteNode($doc, 'h6');
321                 XML::deleteNode($doc, 'ol');
322                 XML::deleteNode($doc, 'ul');
323
324                 $xpath = new DOMXPath($doc);
325
326                 $list = $xpath->query('//meta[@content]');
327                 foreach ($list as $node) {
328                         $meta_tag = [];
329                         if ($node->attributes->length) {
330                                 foreach ($node->attributes as $attribute) {
331                                         $meta_tag[$attribute->name] = $attribute->value;
332                                 }
333                         }
334
335                         if (@$meta_tag['http-equiv'] == 'refresh') {
336                                 $path = $meta_tag['content'];
337                                 $pathinfo = explode(';', $path);
338                                 $content = '';
339                                 foreach ($pathinfo as $value) {
340                                         if (substr(strtolower($value), 0, 4) == 'url=') {
341                                                 $content = substr($value, 4);
342                                         }
343                                 }
344                                 if ($content != '') {
345                                         $siteinfo = self::getSiteinfo($content, $do_oembed, ++$count);
346                                         return $siteinfo;
347                                 }
348                         }
349                 }
350
351                 $list = $xpath->query('//title');
352                 if ($list->length > 0) {
353                         $siteinfo['title'] = trim($list->item(0)->nodeValue);
354                 }
355
356                 $list = $xpath->query('//meta[@name]');
357                 foreach ($list as $node) {
358                         $meta_tag = [];
359                         if ($node->attributes->length) {
360                                 foreach ($node->attributes as $attribute) {
361                                         $meta_tag[$attribute->name] = $attribute->value;
362                                 }
363                         }
364
365                         if (empty($meta_tag['content'])) {
366                                 continue;
367                         }
368
369                         $meta_tag['content'] = trim(html_entity_decode($meta_tag['content'], ENT_QUOTES, 'UTF-8'));
370
371                         switch (strtolower($meta_tag['name'])) {
372                                 case 'fulltitle':
373                                         $siteinfo['title'] = trim($meta_tag['content']);
374                                         break;
375                                 case 'description':
376                                         $siteinfo['text'] = trim($meta_tag['content']);
377                                         break;
378                                 case 'thumbnail':
379                                         $siteinfo['image'] = $meta_tag['content'];
380                                         break;
381                                 case 'twitter:image':
382                                         $siteinfo['image'] = $meta_tag['content'];
383                                         break;
384                                 case 'twitter:image:src':
385                                         $siteinfo['image'] = $meta_tag['content'];
386                                         break;
387                                 case 'twitter:description':
388                                         $siteinfo['text'] = trim($meta_tag['content']);
389                                         break;
390                                 case 'twitter:title':
391                                         $siteinfo['title'] = trim($meta_tag['content']);
392                                         break;
393                                 case 'twitter:player':
394                                         $siteinfo['player']['embed'] = trim($meta_tag['content']);
395                                         break;
396                                 case 'twitter:player:stream':
397                                         $siteinfo['player']['stream'] = trim($meta_tag['content']);
398                                         break;
399                                 case 'twitter:player:width':
400                                         $siteinfo['player']['width'] = intval($meta_tag['content']);
401                                         break;
402                                 case 'twitter:player:height':
403                                         $siteinfo['player']['height'] = intval($meta_tag['content']);
404                                         break;
405                                 case 'dc.title':
406                                         $siteinfo['title'] = trim($meta_tag['content']);
407                                         break;
408                                 case 'dc.description':
409                                         $siteinfo['text'] = trim($meta_tag['content']);
410                                         break;
411                                 case 'dc.creator':
412                                         $siteinfo['publisher_name'] = trim($meta_tag['content']);
413                                         break;
414                                 case 'keywords':
415                                         $keywords = explode(',', $meta_tag['content']);
416                                         break;
417                                 case 'news_keywords':
418                                         $keywords = explode(',', $meta_tag['content']);
419                                         break;
420                         }
421                 }
422
423                 if (isset($keywords)) {
424                         $siteinfo['keywords'] = [];
425                         foreach ($keywords as $keyword) {
426                                 if (!in_array(trim($keyword), $siteinfo['keywords'])) {
427                                         $siteinfo['keywords'][] = trim($keyword);
428                                 }
429                         }
430                 }
431
432                 $list = $xpath->query('//meta[@property]');
433                 foreach ($list as $node) {
434                         $meta_tag = [];
435                         if ($node->attributes->length) {
436                                 foreach ($node->attributes as $attribute) {
437                                         $meta_tag[$attribute->name] = $attribute->value;
438                                 }
439                         }
440
441                         if (!empty($meta_tag['content'])) {
442                                 $meta_tag['content'] = trim(html_entity_decode($meta_tag['content'], ENT_QUOTES, 'UTF-8'));
443
444                                 switch (strtolower($meta_tag['property'])) {
445                                         case 'og:image':
446                                                 $siteinfo['image'] = $meta_tag['content'];
447                                                 break;
448                                         case 'og:image:url':
449                                                 $siteinfo['image'] = $meta_tag['content'];
450                                                 break;
451                                         case 'og:image:secure_url':
452                                                 $siteinfo['image'] = $meta_tag['content'];
453                                                 break;
454                                         case 'og:title':
455                                                 $siteinfo['title'] = trim($meta_tag['content']);
456                                                 break;
457                                         case 'og:description':
458                                                 $siteinfo['text'] = trim($meta_tag['content']);
459                                                 break;
460                                         case 'og:site_name':
461                                                 $siteinfo['publisher_name'] = trim($meta_tag['content']);
462                                                 break;
463                                         case 'og:locale':
464                                                 $siteinfo['language'] = trim($meta_tag['content']);
465                                                 break;
466                                         case 'og:type':
467                                                 $siteinfo['pagetype'] = trim($meta_tag['content']);
468                                                 break;
469                                         case 'twitter:description':
470                                                 $siteinfo['text'] = trim($meta_tag['content']);
471                                                 break;
472                                         case 'twitter:title':
473                                                 $siteinfo['title'] = trim($meta_tag['content']);
474                                                 break;
475                                         case 'twitter:image':
476                                                 $siteinfo['image'] = $meta_tag['content'];
477                                                 break;
478                                 }
479                         }
480                 }
481
482                 $list = $xpath->query("//script[@type='application/ld+json']");
483                 foreach ($list as $node) {
484                         if (!empty($node->nodeValue)) {
485                                 if ($jsonld = json_decode($node->nodeValue, true)) {
486                                         $siteinfo = self::parseParts($siteinfo, $jsonld);
487                                 }
488                         }
489                 }
490
491                 if (!empty($siteinfo['player']['stream'])) {
492                         // Only add player data to media arrays if there is no duplicate
493                         $content_urls = array_merge(array_column($siteinfo['audio'] ?? [], 'content'), array_column($siteinfo['video'] ?? [], 'content'));
494                         if (!in_array($siteinfo['player']['stream'], $content_urls)) {
495                                 $contenttype = self::getContentType($siteinfo['player']['stream']);
496                                 if (!empty($contenttype[0]) && in_array($contenttype[0], ['audio', 'video'])) {
497                                         $media = ['content' => $siteinfo['player']['stream']];
498
499                                         if (!empty($siteinfo['player']['embed'])) {
500                                                 $media['embed'] = $siteinfo['player']['embed'];
501                                         }
502
503                                         $siteinfo[$contenttype[0]][] = $media;
504                                 }
505                         }
506                 }
507
508                 if (!empty($siteinfo['image'])) {
509                         $siteinfo['images'] = $siteinfo['images'] ?? [];
510                         array_unshift($siteinfo['images'], ['url' => $siteinfo['image']]);
511                         unset($siteinfo['image']);
512                 }
513
514                 $siteinfo = self::checkMedia($url, $siteinfo);
515
516                 if (!empty($siteinfo['text']) && mb_strlen($siteinfo['text']) > self::MAX_DESC_COUNT) {
517                         $siteinfo['text'] = mb_substr($siteinfo['text'], 0, self::MAX_DESC_COUNT) . '…';
518                         $pos = mb_strrpos($siteinfo['text'], '.');
519                         if ($pos > self::MIN_DESC_COUNT) {
520                                 $siteinfo['text'] = mb_substr($siteinfo['text'], 0, $pos + 1);
521                         }
522                 }
523
524                 Logger::info('Siteinfo fetched', ['url' => $url, 'siteinfo' => $siteinfo]);
525
526                 Hook::callAll('getsiteinfo', $siteinfo);
527
528                 ksort($siteinfo);
529
530                 return $siteinfo;
531         }
532
533         /**
534          * Check the attached media elements.
535          * Fix existing data and add missing data.
536          *
537          * @param string $page_url
538          * @param array $siteinfo
539          * @return array
540          */
541         private static function checkMedia(string $page_url, array $siteinfo) : array
542         {
543                 if (!empty($siteinfo['images'])) {
544                         array_walk($siteinfo['images'], function (&$image) use ($page_url) {
545                                 // According to the specifications someone could place a picture url into the content field as well.
546                                 // But this doesn't seem to happen in the wild, so we don't cover it here.
547                                 if (!empty($image['url'])) {
548                                         $image['url'] = self::completeUrl($image['url'], $page_url);
549                                         $photodata = Images::getInfoFromURLCached($image['url']);
550                                         if (!empty($photodata) && ($photodata[0] > 50) && ($photodata[1] > 50)) {
551                                                 $image['src'] = $image['url'];
552                                                 $image['width'] = $photodata[0];
553                                                 $image['height'] = $photodata[1];
554                                                 $image['contenttype'] = $photodata['mime'];
555                                                 unset($image['url']);
556                                                 ksort($image);
557                                         } else {
558                                                 $image = [];
559                                         }
560                                 } else {
561                                         $image = [];
562                                 }
563                         });
564
565                         $siteinfo['images'] = array_values(array_filter($siteinfo['images']));
566                 }
567
568                 foreach (['audio', 'video'] as $element) {
569                         if (!empty($siteinfo[$element])) {
570                                 array_walk($siteinfo[$element], function (&$media) use ($page_url, &$siteinfo) {
571                                         $url = '';
572                                         $embed = '';
573                                         $content = '';
574                                         $contenttype = '';
575                                         foreach (['embed', 'content', 'url'] as $field) {
576                                                 if (!empty($media[$field])) {
577                                                         $media[$field] = self::completeUrl($media[$field], $page_url);
578                                                         $type = self::getContentType($media[$field]);
579                                                         if (($type[0] ?? '') == 'text') {
580                                                                 if ($field == 'embed') {
581                                                                         $embed = $media[$field];
582                                                                 } else {
583                                                                         $url = $media[$field];
584                                                                 }
585                                                         } elseif (!empty($type[0])) {
586                                                                 $content = $media[$field];
587                                                                 $contenttype = implode('/', $type);
588                                                         }
589                                                 }
590                                                 unset($media[$field]);
591                                         }
592
593                                         foreach (['image', 'preview'] as $field) {
594                                                 if (!empty($media[$field])) {
595                                                         $media[$field] = self::completeUrl($media[$field], $page_url);
596                                                 }
597                                         }
598
599                                         if (!empty($url)) {
600                                                 $media['url'] = $url;
601                                         }
602                                         if (!empty($embed)) {
603                                                 $media['embed'] = $embed;
604                                                 if (empty($siteinfo['player']['embed'])) {
605                                                         $siteinfo['player']['embed'] = $embed;
606                                                 }
607                                         }
608                                         if (!empty($content)) {
609                                                 $media['src'] = $content;
610                                         }
611                                         if (!empty($contenttype)) {
612                                                 $media['contenttype'] = $contenttype;
613                                         }
614                                         if (empty($url) && empty($content) && empty($embed)) {
615                                                 $media = [];
616                                         }
617                                         ksort($media);
618                                 });
619
620                                 $siteinfo[$element] = array_values(array_filter($siteinfo[$element]));
621                         }
622                         if (empty($siteinfo[$element])) {
623                                 unset($siteinfo[$element]);
624                         }
625                 }
626                 return $siteinfo;
627         }
628
/**
 * Convert tags from CSV to an array of hashtags
 *
 * Every tag is trimmed and prefixed with the "#" sign. Empty entries
 * (an empty input string, doubled or trailing commas) are skipped, so
 * the result never contains a bare "#".
 *
 * @param string $string Comma separated list of tags
 * @return array List of formatted hashtags, empty when there are no tags
 */
public static function convertTagsToArray($string)
{
    $hashtags = [];

    // The escape character is handed over explicitly (it equals the
    // historic default), since relying on the default is deprecated
    // in recent PHP versions.
    foreach (str_getcsv((string)$string, ',', '"', '\\') as $tag) {
        // str_getcsv() reports an empty input as a single null field
        $tag = trim((string)$tag);
        if ($tag === '') {
            continue;
        }

        $hashtags[] = '#' . $tag;
    }

    return $hashtags;
}
645
/**
 * Prefix a tag with the hash sign
 *
 * Intended as a callback for array_walk(): the tag is changed in place.
 *
 * @param string $tag The plain tag name, modified by reference
 * @param int    $k   Array key handed over by array_walk(), not used
 * @return void
 */
private static function arrAddHashes(&$tag, $k)
{
    $tag = "#{$tag}";
}
659
/**
 * Complete an url that lacks a scheme
 *
 * The src attribute of some html elements (e.g. images) can miss the
 * scheme, or the scheme and the host. The missing parts are taken from
 * the url of the page that contains the element:
 * - "//cdn.example.com/a.png" keeps its own host (and port) and only
 *   receives the scheme of the page
 * - "/a.png" and "a.png" receive scheme, host and port of the page; the
 *   path is always treated as starting at the document root
 *
 * The url is returned unchanged when it already has a scheme, when it
 * can't be parsed, or when the page url offers no scheme and host.
 *
 * @param string $url    The url which possibly misses a scheme
 *                       (e.g. a link to an image)
 * @param string $scheme An url with a correct scheme and host
 *                       (e.g. the url of the page that contains the image)
 *
 * @return string The url with a scheme
 */
private static function completeUrl($url, $scheme)
{
    $urlarr = parse_url($url);

    // parse_url() returns false on seriously malformed urls. Those can't be
    // completed, and urls that already have a scheme don't need to be.
    if (!is_array($urlarr) || isset($urlarr['scheme'])) {
        return $url;
    }

    $schemearr = parse_url($scheme);

    // Without scheme and host of the page there is nothing to complete with
    if (!is_array($schemearr) || empty($schemearr['scheme']) || empty($schemearr['host'])) {
        return $url;
    }

    // A protocol-relative url ("//host/path") carries its own authority,
    // everything else is located on the host of the page.
    $authority = empty($urlarr['host']) ? $schemearr : $urlarr;

    $complete = $schemearr['scheme'] . '://' . $authority['host'];

    if (!empty($authority['port'])) {
        $complete .= ':' . $authority['port'];
    }

    if (!empty($urlarr['path'])) {
        if (strpos($urlarr['path'], '/') !== 0) {
            $complete .= '/';
        }

        $complete .= $urlarr['path'];
    }

    if (!empty($urlarr['query'])) {
        $complete .= '?' . $urlarr['query'];
    }

    if (!empty($urlarr['fragment'])) {
        $complete .= '#' . $urlarr['fragment'];
    }

    return $complete;
}
710
/**
 * Walk through a decoded JSON-LD structure and feed every typed node
 * to the JSON-LD parser
 *
 * Three shapes are handled: a node with a "@graph" list, a single node
 * with a "@type", and a plain list (only integer keys) of nodes. Lists
 * are processed recursively. Finally every string in the site
 * information is decoded from HTML entities, stripped of tags and trimmed.
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo
 */
private static function parseParts(array $siteinfo, array $jsonld)
{
    $children = [];

    if (!empty($jsonld['@graph']) && is_array($jsonld['@graph'])) {
        $children = $jsonld['@graph'];
    } elseif (!empty($jsonld['@type'])) {
        $siteinfo = self::parseJsonLd($siteinfo, $jsonld);
    } elseif (count(array_filter(array_keys($jsonld), 'is_string')) === 0) {
        // Array keys are either integers or strings, so the absence of
        // string keys means that this is a plain list of parts.
        $children = $jsonld;
    }

    foreach ($children as $child) {
        if (is_array($child) && !empty($child)) {
            $siteinfo = self::parseParts($siteinfo, $child);
        }
    }

    array_walk_recursive($siteinfo, static function (&$value) {
        if (!is_string($value)) {
            return;
        }

        $decoded = html_entity_decode($value, ENT_COMPAT, 'UTF-8');
        $value   = trim(strip_tags($decoded));
    });

    return $siteinfo;
}
753
/**
 * Improve the siteinfo with information from the provided JSON-LD information
 *
 * The "@type" of the node decides which specialised parser is used.
 * Nodes without a type, with a type that is not a plain string, with an
 * explicitly ignored type or with an unknown type leave the site
 * information untouched.
 *
 * @see https://jsonld.com/
 * @see https://schema.org/
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo
 */
private static function parseJsonLd(array $siteinfo, array $jsonld)
{
    $type = JsonLD::fetchElement($jsonld, '@type');
    if (empty($type)) {
        Logger::info('Empty type', ['url' => $siteinfo['url']]);
        return $siteinfo;
    }

    // Only plain type names can be dispatched. Anything else is treated
    // like an unknown type instead of being loosely compared to strings.
    if (!is_string($type)) {
        Logger::info('Unknown type', ['type' => $type, 'url' => $siteinfo['url']]);
        return $siteinfo;
    }

    // Silently ignore some types that aren't processed
    $ignored = [
        'SiteNavigationElement', 'JobPosting', 'CreativeWork', 'MusicAlbum',
        'WPHeader', 'WPSideBar', 'WPFooter', 'LegalService', 'MusicRecording',
        'ItemList', 'BreadcrumbList', 'Blog', 'Dataset', 'Product',
    ];
    if (in_array($type, $ignored, true)) {
        return $siteinfo;
    }

    switch ($type) {
        case 'Article':
        case 'AdvertiserContentArticle':
        case 'NewsArticle':
        case 'Report':
        case 'SatiricalArticle':
        case 'ScholarlyArticle':
        case 'SocialMediaPosting':
        case 'TechArticle':
        case 'ReportageNewsArticle':
        case 'BlogPosting':
        case 'LiveBlogPosting':
        case 'DiscussionForumPosting':
            return self::parseJsonLdArticle($siteinfo, $jsonld);
        case 'WebPage':
        case 'AboutPage':
        case 'CheckoutPage':
        case 'CollectionPage':
        case 'ContactPage':
        case 'FAQPage':
        case 'ItemPage':
        case 'MedicalWebPage':
        case 'ProfilePage':
        case 'QAPage':
        case 'RealEstateListing':
        case 'SearchResultsPage':
        case 'MediaGallery':
        case 'ImageGallery':
        case 'VideoGallery':
        case 'RadioEpisode':
        case 'Event':
            return self::parseJsonLdWebPage($siteinfo, $jsonld);
        case 'WebSite':
            return self::parseJsonLdWebSite($siteinfo, $jsonld);
        case 'Organization':
        case 'Airline':
        case 'Consortium':
        case 'Corporation':
        case 'EducationalOrganization':
        case 'FundingScheme':
        case 'GovernmentOrganization':
        case 'LibrarySystem':
        case 'LocalBusiness':
        case 'MedicalOrganization':
        case 'NGO':
        case 'NewsMediaOrganization':
        case 'Project':
        case 'SportsOrganization':
        case 'WorkersUnion':
            return self::parseJsonLdWebOrganization($siteinfo, $jsonld);
        case 'Person':
        case 'Patient':
        case 'PerformingGroup':
        case 'DanceGroup':
        case 'MusicGroup':
        case 'TheaterGroup':
            return self::parseJsonLdWebPerson($siteinfo, $jsonld);
        case 'AudioObject':
        case 'Audio':
            return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'audio');
        case 'VideoObject':
            return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'video');
        case 'ImageObject':
            return self::parseJsonLdMediaObject($siteinfo, $jsonld, 'images');
        default:
            Logger::info('Unknown type', ['type' => $type, 'url' => $siteinfo['url']]);
            return $siteinfo;
    }
}
848
/**
 * Collect publisher and author data from a JSON-LD node
 *
 * Publisher and author are either given as a plain string (then used as
 * the name) or as a structure with name, url and logo. For the publisher
 * the data of an "Organization" brand replaces name and url, and the
 * logo of the publisher itself replaces the logo of the brand.
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched author/publisher fields
 */
private static function parseJsonLdAuthor(array $siteinfo, array $jsonld)
{
    $fetched = [];

    // Stores the trimmed value found in $source at the given path under
    // $field, provided that the value is a non-empty string.
    $store = static function ($field, array $source, ...$path) use (&$fetched) {
        $value = JsonLD::fetchElement($source, ...$path);
        if (!empty($value) && is_string($value)) {
            $fetched[$field] = trim($value);
        }
    };

    $publisher = $jsonld['publisher'] ?? null;
    if (!empty($publisher) && is_array($publisher)) {
        $store('publisher_name', $jsonld, 'publisher', 'name');
        $store('publisher_url', $jsonld, 'publisher', 'url');

        $brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization');
        if (!empty($brand) && is_array($brand)) {
            $store('publisher_name', $brand, 'name');
            $store('publisher_url', $brand, 'url');
            $store('publisher_img', $brand, 'logo', 'url');
        }

        $publisherLogo = JsonLD::fetchElement($jsonld, 'publisher', 'logo');
        if (!empty($publisherLogo) && is_array($publisherLogo)) {
            $store('publisher_img', $publisherLogo, 'url');
        }
    } elseif (!empty($publisher) && is_string($publisher)) {
        $fetched['publisher_name'] = trim($publisher);
    }

    $author = $jsonld['author'] ?? null;
    if (!empty($author) && is_array($author)) {
        $store('author_name', $jsonld, 'author', 'name');
        // "url" is fetched after "sameAs" and so wins when both are present
        $store('author_url', $jsonld, 'author', 'sameAs');
        $store('author_url', $jsonld, 'author', 'url');

        $authorLogo = JsonLD::fetchElement($jsonld, 'author', 'logo');
        if (!empty($authorLogo) && is_array($authorLogo)) {
            $store('author_img', $authorLogo, 'url');
        }
    } elseif (!empty($author) && is_string($author)) {
        $fetched['author_name'] = trim($author);
    }

    Logger::info('Fetched Author information', ['fetched' => $fetched]);

    return array_merge($siteinfo, $fetched);
}
931
/**
 * Fetch data from the provided JSON-LD Article type
 *
 * Reads headline, alternative headline, description, image, keywords,
 * publishing/modification dates and the author/publisher data. Values
 * that aren't of the expected type are skipped.
 *
 * @see https://schema.org/Article
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched article fields
 */
private static function parseJsonLdArticle(array $siteinfo, array $jsonld)
{
    $jsonldinfo = [];

    $content = JsonLD::fetchElement($jsonld, 'headline');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['title'] = trim($content);
    }

    $content = JsonLD::fetchElement($jsonld, 'alternativeHeadline');
    if (!empty($content) && is_string($content) && (($jsonldinfo['title'] ?? '') != trim($content))) {
        $jsonldinfo['alternative_title'] = trim($content);
    }

    $content = JsonLD::fetchElement($jsonld, 'description');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['text'] = trim($content);
    }

    // The thumbnail is only a fallback, an explicit image object wins.
    // The type check avoids calling trim() on a non-string value.
    $content = JsonLD::fetchElement($jsonld, 'thumbnailUrl');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['image'] = trim($content);
    }

    $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['image'] = trim($content);
    }

    // Keywords are either a comma separated string or a list. Both forms
    // are collected in $jsonldinfo so that they show up in the log entry
    // and are merged in the same way as every other fetched field.
    if (!empty($jsonld['keywords']) && !is_array($jsonld['keywords'])) {
        $content = JsonLD::fetchElement($jsonld, 'keywords');
        if (!empty($content) && is_scalar($content)) {
            $jsonldinfo['keywords'] = array_map('trim', explode(',', (string)$content));
        }
    } elseif (!empty($jsonld['keywords'])) {
        $content = JsonLD::fetchElementArray($jsonld, 'keywords');
        if (!empty($content) && is_array($content)) {
            $jsonldinfo['keywords'] = $content;
        }
    }

    $content = JsonLD::fetchElement($jsonld, 'datePublished');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['published'] = DateTimeFormat::utc($content);
    }

    $content = JsonLD::fetchElement($jsonld, 'dateModified');
    if (!empty($content) && is_string($content)) {
        $jsonldinfo['modified'] = DateTimeFormat::utc($content);
    }

    $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld);

    Logger::info('Fetched article information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);

    return array_merge($siteinfo, $jsonldinfo);
}
1001
/**
 * Fetch data from the provided JSON-LD WebPage type
 *
 * Reads name, description and image of the page plus the author and
 * publisher data. Only non-empty string values are used, like in the
 * article parser, so that structured values never reach trim().
 *
 * @see https://schema.org/WebPage
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched page fields
 */
private static function parseJsonLdWebPage(array $siteinfo, array $jsonld)
{
    $jsonldinfo = [];

    // JSON-LD property => siteinfo field. The order matters: a later
    // property replaces the value of an earlier one with the same field.
    $properties = [
        'name'         => 'title',
        'description'  => 'text',
        'image'        => 'image',
        'thumbnailUrl' => 'image',
    ];

    foreach ($properties as $property => $field) {
        $content = JsonLD::fetchElement($jsonld, $property);
        if (!empty($content) && is_string($content)) {
            $jsonldinfo[$field] = trim($content);
        }
    }

    $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld);

    Logger::info('Fetched WebPage information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);

    return array_merge($siteinfo, $jsonldinfo);
}
1040
/**
 * Fetch data from the provided JSON-LD WebSite type
 *
 * The site is regarded as the publisher: its name, description and url
 * fill the publisher fields, the thumbnail is used as image. Author and
 * publisher data of the node is fetched as well. Only non-empty string
 * values are used so that structured values never reach trim().
 *
 * @see https://schema.org/WebSite
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched site fields
 */
private static function parseJsonLdWebSite(array $siteinfo, array $jsonld)
{
    $jsonldinfo = [];

    // JSON-LD property => siteinfo field
    $properties = [
        'name'         => 'publisher_name',
        'description'  => 'publisher_description',
        'url'          => 'publisher_url',
        'thumbnailUrl' => 'image',
    ];

    foreach ($properties as $property => $field) {
        $content = JsonLD::fetchElement($jsonld, $property);
        if (!empty($content) && is_string($content)) {
            $jsonldinfo[$field] = trim($content);
        }
    }

    $jsonldinfo = self::parseJsonLdAuthor($jsonldinfo, $jsonld);

    Logger::info('Fetched WebSite information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
    return array_merge($siteinfo, $jsonldinfo);
}
1078
/**
 * Fetch data from the provided JSON-LD Organization type
 *
 * The organization is regarded as the publisher. Name and url of an
 * "Organization" brand replace the name and url of the organization
 * itself. Only non-empty string values are used so that structured
 * values never reach trim().
 *
 * @see https://schema.org/Organization
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched organization fields
 */
private static function parseJsonLdWebOrganization(array $siteinfo, array $jsonld)
{
    $jsonldinfo = [];

    // Pairs of siteinfo field and the arguments that locate the value in
    // the JSON-LD node. The order matters: later entries replace the
    // value of earlier entries for the same field.
    $sources = [
        ['publisher_name',        ['name']],
        ['publisher_description', ['description']],
        ['publisher_url',         ['url']],
        ['publisher_img',         ['logo', 'url', '@type', 'ImageObject']],
        ['publisher_name',        ['brand', 'name', '@type', 'Organization']],
        ['publisher_url',         ['brand', 'url', '@type', 'Organization']],
    ];

    foreach ($sources as $source) {
        $content = JsonLD::fetchElement($jsonld, ...$source[1]);
        if (!empty($content) && is_string($content)) {
            $jsonldinfo[$source[0]] = trim($content);
        }
    }

    Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
    return array_merge($siteinfo, $jsonldinfo);
}
1124
/**
 * Fetch data from the provided JSON-LD Person type
 *
 * The person is regarded as the author: name, description, url ("url"
 * is preferred over "sameAs") and image fill the author fields. Only
 * non-empty string values are used so that structured values never
 * reach trim(); an unexpected image value is additionally logged.
 *
 * @see https://schema.org/Person
 *
 * @param array $siteinfo
 * @param array $jsonld
 * @return array siteinfo, merged with the fetched person fields
 */
private static function parseJsonLdWebPerson(array $siteinfo, array $jsonld)
{
    $jsonldinfo = [];

    // JSON-LD property => siteinfo field. "url" comes after "sameAs" and
    // therefore replaces it when both are present.
    $properties = [
        'name'        => 'author_name',
        'description' => 'author_description',
        'sameAs'      => 'author_url',
        'url'         => 'author_url',
    ];

    foreach ($properties as $property => $field) {
        $content = JsonLD::fetchElement($jsonld, $property);
        if (!empty($content) && is_string($content)) {
            $jsonldinfo[$field] = trim($content);
        }
    }

    $content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject');
    if (!empty($content)) {
        if (is_string($content)) {
            $jsonldinfo['author_img'] = trim($content);
        } else {
            Logger::notice('Unexpected return value for the author image', ['content' => $content]);
        }
    }

    Logger::info('Fetched Person information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
    return array_merge($siteinfo, $jsonldinfo);
}
1169
/**
 * Fetch data from the provided JSON-LD MediaObject type
 *
 * The collected media data is appended to the list that is stored in
 * the site information under the given name. Values that are neither
 * strings nor numbers (e.g. nested structures) are skipped so that
 * they never reach trim().
 *
 * @see https://schema.org/MediaObject
 *
 * @param array  $siteinfo
 * @param array  $jsonld
 * @param string $name     Key of the siteinfo list the media is added to
 *                         (the callers in this class use "audio", "video" and "images")
 * @return array siteinfo
 */
private static function parseJsonLdMediaObject(array $siteinfo, array $jsonld, string $name)
{
    $media = [];

    // Returns the trimmed value of a property, or an empty string when
    // the property is missing, empty or not a scalar. Scalars instead of
    // only strings are accepted since dimensions may be given as numbers.
    $fetch = static function ($property) use ($jsonld) {
        $content = JsonLD::fetchElement($jsonld, $property);
        if (empty($content) || !is_scalar($content)) {
            return '';
        }

        return trim((string)$content);
    };

    foreach (['caption', 'url'] as $property) {
        $value = $fetch($property);
        if ($value !== '') {
            $media[$property] = $value;
        }
    }

    // Flags whether the media is the main entity of the fetched page
    $content = JsonLD::fetchElement($jsonld, 'mainEntityOfPage');
    if (!empty($content) && is_string($content)) {
        $media['main'] = Strings::compareLink($content, $siteinfo['url']);
    }

    $value = $fetch('description');
    if ($value !== '') {
        $media['description'] = $value;
    }

    // A name that only repeats the description carries no information
    $value = $fetch('name');
    if (($value !== '') && (($media['description'] ?? '') != $value)) {
        $media['name'] = $value;
    }

    // JSON-LD property => media field
    $properties = [
        'contentUrl' => 'content',
        'embedUrl'   => 'embed',
        'height'     => 'height',
        'width'      => 'width',
        'image'      => 'image',
    ];

    foreach ($properties as $property => $field) {
        $value = $fetch($property);
        if ($value !== '') {
            $media[$field] = $value;
        }
    }

    // The thumbnail serves as image when there is none, otherwise as
    // preview, unless it is identical to the image.
    $value = $fetch('thumbnailUrl');
    if (($value !== '') && (($media['image'] ?? '') != $value)) {
        if (!empty($media['image'])) {
            $media['preview'] = $value;
        } else {
            $media['image'] = $value;
        }
    }

    Logger::info('Fetched Media information', ['url' => $siteinfo['url'], 'fetched' => $media]);
    $siteinfo[$name][] = $media;
    return $siteinfo;
}
1245 }