src/Content/PageInfo.php

   1 <?php
   2 /**
   3  * @copyright Copyright (C) 2010-2023, the Friendica project
   4  *
   5  * @license GNU AGPL version 3 or any later version
   6  *
   7  * This program is free software: you can redistribute it and/or modify
   8  * it under the terms of the GNU Affero General Public License as
   9  * published by the Free Software Foundation, either version 3 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU Affero General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Affero General Public License
  18  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  19  *
  20  */
  21
  22 namespace Friendica\Content;
  23
  24 use Friendica\Core\Hook;
  25 use Friendica\Core\Logger;
  26 use Friendica\DI;
  27 use Friendica\Network\HTTPException;
  28 use Friendica\Util\ParseUrl;
  29 use Friendica\Util\Strings;
  30
  31 /**
  32  * Extracts trailing URLs from post bodies to transform them in enriched attachment tags through Site Info query
  33  */
  34 class PageInfo
  35 {
  36         /**
  37          * @param string $body
  38          * @param bool   $searchNakedUrls
  39          * @param bool   $no_photos
  40          * @return string
  41          * @throws HTTPException\InternalServerErrorException
  42          */
  43         public static function searchAndAppendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false)
  44         {
  45                 Logger::debug('add_page_info_to_body: fetch page info for body', ['body' => $body]);
  46
  47                 $url = self::getRelevantUrlFromBody($body, $searchNakedUrls);
  48                 if (!$url) {
  49                         return $body;
  50                 }
  51
  52                 $data = self::queryUrl($url);
  53                 if (!$data) {
  54                         return $body;
  55                 }
  56
  57                 return self::appendDataToBody($body, $data, $no_photos);
  58         }
  59
  60         /**
  61          * @param string $body
  62          * @param array  $data
  63          * @param bool   $no_photos
  64          * @return string
  65          * @throws HTTPException\InternalServerErrorException
  66          */
  67         public static function appendDataToBody(string $body, array $data, bool $no_photos = false): string
  68         {
  69                 // Only one [attachment] tag per body is allowed
  70                 $existingAttachmentPos = strpos($body, '[attachment');
  71                 if ($existingAttachmentPos !== false) {
  72                         $linkTitle = $data['title'] ?: $data['url'];
  73                         // Additional link attachments are prepended before the existing [attachment] tag
  74                         $body = substr_replace($body, "\n[bookmark=" . $data['url'] . ']' . $linkTitle . "[/bookmark]\n", $existingAttachmentPos, 0);
  75                 } else {
  76                         $footer = self::getFooterFromData($data, $no_photos);
  77                         $body = self::stripTrailingUrlFromBody($body, $data['url']);
  78                         $body .= "\n" . $footer;
  79                 }
  80
  81                 return $body;
  82         }
  83
  84         /**
  85          * @param string $url
  86          * @param bool $no_photos
  87          * @param string $photo
  88          * @param bool $keywords
  89          * @param string $keyword_denylist
  90          * @return string
  91          * @throws HTTPException\InternalServerErrorException
  92          */
  93         public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = ''): string
  94         {
  95                 $data = self::queryUrl($url, $photo, $keywords, $keyword_denylist);
  96
  97                 return self::getFooterFromData($data, $no_photos);
  98         }
  99
 100         /**
 101          * @param array $data
 102          * @param bool  $no_photos
 103          * @return string
 104          * @throws HTTPException\InternalServerErrorException
 105          */
 106         public static function getFooterFromData(array $data, bool $no_photos = false): string
 107         {
 108                 Hook::callAll('page_info_data', $data);
 109
 110                 if (empty($data['type'])) {
 111                         return '';
 112                 }
 113
 114                 // It maybe is a rich content, but if it does have everything that a link has,
 115                 // then treat it that way
 116                 if (($data['type'] == 'rich') && is_string($data['title']) &&
 117                         is_string($data['text']) && !empty($data['images'])) {
 118                         $data['type'] = 'link';
 119                 }
 120
 121                 $data['title'] = $data['title'] ?? '';
 122
 123                 if ((($data['type'] != 'link') && ($data['type'] != 'video') && ($data['type'] != 'photo')) || ($data['title'] == $data['url'])) {
 124                         return '';
 125                 }
 126
 127                 if ($no_photos && ($data['type'] == 'photo')) {
 128                         return '';
 129                 }
 130
 131                 // Escape some bad characters
 132                 $text = "[attachment";
 133
 134                 foreach (['type', 'url', 'title', 'alternative_title', 'publisher_name', 'publisher_url', 'publisher_img', 'author_name', 'author_url', 'author_img'] as $field) {
 135                         if (!empty($data[$field])) {
 136                                 $text .= " " . $field . "='" . str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'";
 137                         }
 138                 }
 139
 140                 if (empty($data['text'])) {
 141                         $data['text'] = '';
 142                 }
 143
 144                 // Only embedd a picture link when it seems to be a valid picture ("width" is set)
 145                 if (!empty($data['images']) && !empty($data['images'][0]['width'])) {
 146                         $preview = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['images'][0]['src'], ENT_QUOTES, 'UTF-8', false));
 147                         // if the preview picture is larger than 500 pixels then show it in a larger mode
 148                         // But only, if the picture isn't higher than large (To prevent huge posts)
 149                         if (!DI::config()->get('system', 'always_show_preview') && ($data['images'][0]['width'] >= 500)
 150                                 && ($data['images'][0]['width'] >= $data['images'][0]['height'])) {
 151                                 $text .= " image='" . $preview . "'";
 152                         } else {
 153                                 $text .= " preview='" . $preview . "'";
 154
 155                                 if (empty($data['text'])) {
 156                                         $data['text'] = $data['title'];
 157                                 }
 158
 159                                 if (empty($data['text'])) {
 160                                         $data['text'] = $data['url'];
 161                                 }
 162                         }
 163                 }
 164
 165                 $text .= ']' . str_replace(['[', ']'], ['&#91;', '&#93;'], $data['text']) . '[/attachment]';
 166
 167                 $hashtags = '';
 168                 if (!empty($data['keywords'])) {
 169                         $hashtags = "\n";
 170                         foreach ($data['keywords'] as $keyword) {
 171                                 /// @TODO make a positive list of allowed characters
 172                                 $hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
 173                                 $hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
 174                         }
 175                 }
 176
 177                 return $text . $hashtags;
 178         }
 179
 180         /**
 181          * @param string  $url
 182          * @param string $photo
 183          * @param bool $keywords
 184          * @param string $keyword_denylist
 185          * @return array|bool
 186          * @throws HTTPException\InternalServerErrorException
 187          */
 188         public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
 189         {
 190                 $data = ParseUrl::getSiteinfoCached($url);
 191
 192                 if ($photo != '') {
 193                         $data['images'][0]['src'] = $photo;
 194                 }
 195
 196                 if (!$keywords) {
 197                         unset($data['keywords']);
 198                 } elseif ($keyword_denylist && !empty($data['keywords'])) {
 199                         $list = explode(', ', $keyword_denylist);
 200
 201                         foreach ($list as $keyword) {
 202                                 $keyword = trim($keyword);
 203
 204                                 $index = array_search($keyword, $data['keywords']);
 205                                 if ($index !== false) {
 206                                         unset($data['keywords'][$index]);
 207                                 }
 208                         }
 209                 }
 210
 211                 Logger::debug('fetch page info for URL', ['url' => $url, 'data' => $data]);
 212
 213                 return $data;
 214         }
 215
 216         /**
 217          * @param string $url
 218          * @param string $photo
 219          * @param string $keyword_denylist
 220          * @return array
 221          * @throws HTTPException\InternalServerErrorException
 222          */
 223         public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = ''): array
 224         {
 225                 $data = self::queryUrl($url, $photo, true, $keyword_denylist);
 226
 227                 if (empty($data['keywords'])) {
 228                         return [];
 229                 }
 230
 231                 $taglist = [];
 232                 foreach ($data['keywords'] as $keyword) {
 233                         $hashtag = str_replace([' ', '+', '/', '.', '#', "'"],
 234                                 ['', '', '', '', '', ''], $keyword);
 235
 236                         $taglist[] = $hashtag;
 237                 }
 238
 239                 return $taglist;
 240         }
 241
 242         /**
 243          * Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info.
 244          *
 245          * @param string $body
 246          * @param bool   $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort
 247          * @return string|null
 248          */
 249         public static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false)
 250         {
 251                 $URLSearchString = 'https?://[^\[\]]*';
 252
 253                 // Fix for Mastodon where the mentions are in a different format
 254                 $body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
 255
 256                 // Remove all hashtags and mentions
 257                 $body = preg_replace("/([#@!])\[url\=(.*?)\](.*?)\[\/url\]/ism", '', $body);
 258
 259                 // Search for pure links
 260                 preg_match("/\[url\](https?:.*?)\[\/url\]/ism", $body, $matches);
 261
 262                 if (!$matches) {
 263                         // Search for links with descriptions
 264                         preg_match("/\[url\=(https?:.*?)\].*?\[\/url\]/ism", $body, $matches);
 265                 }
 266
 267                 if (!$matches && $searchNakedUrls) {
 268                         preg_match(Strings::autoLinkRegEx(), $body, $matches);
 269                         if ($matches && !Strings::endsWith($body, $matches[1])) {
 270                                 unset($matches);
 271                         }
 272                 }
 273
 274                 return $matches[1] ?? null;
 275         }
 276
 277         /**
 278          * Remove the provided URL from the body if it is at the end of it.
 279          * Keep the link label if it isn't the full URL or a shortened version of it.
 280          *
 281          * @param string $body
 282          * @param string $url
 283          * @return string
 284          */
 285         protected static function stripTrailingUrlFromBody(string $body, string $url): string
 286         {
 287                 $quotedUrl = preg_quote($url, '#');
 288                 $body = preg_replace_callback("#(?:
 289                         \[url]$quotedUrl\[/url]|
 290                         \[url=$quotedUrl]$quotedUrl\[/url]|
 291                         \[url=$quotedUrl]([^[]*?)\[/url]|
 292                         $quotedUrl
 293                 )$#isx", function ($match) use ($url) {
 294                         // Stripping URLs with no label
 295                         if (empty($match[1])) {
 296                                 return '';
 297                         }
 298
 299                         // Stripping link labels that include a shortened version of the URL
 300                         $trimMatch = trim($match[1], '.…');
 301                         if (!empty($trimMatch) && strpos($url, $trimMatch) !== false) {
 302                                 return '';
 303                         }
 304
 305                         // Keep all other labels
 306                         return $match[1];
 307                 }, $body);
 308
 309                 return rtrim($body);
 310         }
 311 }