src/Content/PageInfo.php

   1 <?php
   2 /**
   3  * @copyright Copyright (C) 2020, Friendica
   4  *
   5  * @license GNU AGPL version 3 or any later version
   6  *
   7  * This program is free software: you can redistribute it and/or modify
   8  * it under the terms of the GNU Affero General Public License as
   9  * published by the Free Software Foundation, either version 3 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU Affero General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Affero General Public License
  18  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  19  *
  20  */
  21
  22 namespace Friendica\Content;
  23
  24 use Friendica\Core\Hook;
  25 use Friendica\Core\Logger;
  26 use Friendica\DI;
  27 use Friendica\Network\HTTPException;
  28 use Friendica\Util\ParseUrl;
  29 use Friendica\Util\Strings;
  30
  31 /**
 * Extracts trailing URLs from post bodies to transform them into enriched attachment tags through a Site Info query
  33  */
  34 class PageInfo
  35 {
  36         /**
  37          * @param string $body
  38          * @param bool   $searchNakedUrls
  39          * @param bool   $no_photos
  40          * @return string
  41          * @throws HTTPException\InternalServerErrorException
  42          */
  43         public static function appendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false)
  44         {
  45                 Logger::info('add_page_info_to_body: fetch page info for body', ['body' => $body]);
  46
  47                 $url = self::getRelevantUrlFromBody($body, $searchNakedUrls);
  48                 if (!$url) {
  49                         return $body;
  50                 }
  51
  52                 $footer = self::getFooterFromUrl($url, $no_photos);
  53                 if (!$footer) {
  54                         return $body;
  55                 }
  56
  57                 $body = self::stripTrailingUrlFromBody($body, $url);
  58
  59                 $body .= "\n" . $footer;
  60
  61                 return $body;
  62         }
  63
  64         /**
  65          * @param string $url
  66          * @param bool $no_photos
  67          * @param string $photo
  68          * @param bool $keywords
  69          * @param string $keyword_denylist
  70          * @return string
  71          * @throws HTTPException\InternalServerErrorException
  72          */
  73         public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
  74         {
  75                 $data = self::queryUrl($url, $photo, $keywords, $keyword_denylist);
  76
  77                 return self::getFooterFromData($data, $no_photos);
  78         }
  79
  80         /**
  81          * @param array $data
  82          * @param bool  $no_photos
  83          * @return string
  84          * @throws HTTPException\InternalServerErrorException
  85          */
  86         public static function getFooterFromData(array $data, bool $no_photos = false)
  87         {
  88                 Hook::callAll('page_info_data', $data);
  89
  90                 if (empty($data['type'])) {
  91                         return '';
  92                 }
  93
  94                 // It maybe is a rich content, but if it does have everything that a link has,
  95                 // then treat it that way
  96                 if (($data['type'] == 'rich') && is_string($data['title']) &&
  97                         is_string($data['text']) && !empty($data['images'])) {
  98                         $data['type'] = 'link';
  99                 }
 100
 101                 $data['title'] = $data['title'] ?? '';
 102
 103                 if ((($data['type'] != 'link') && ($data['type'] != 'video') && ($data['type'] != 'photo')) || ($data['title'] == $data['url'])) {
 104                         return '';
 105                 }
 106
 107                 if ($no_photos && ($data['type'] == 'photo')) {
 108                         return '';
 109                 }
 110
 111                 // Escape some bad characters
 112                 $data['url'] = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['url'], ENT_QUOTES, 'UTF-8', false));
 113                 $data['title'] = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['title'], ENT_QUOTES, 'UTF-8', false));
 114
 115                 $text = "[attachment type='" . $data['type'] . "'";
 116
 117                 if (empty($data['text'])) {
 118                         $data['text'] = $data['title'];
 119                 }
 120
 121                 if (empty($data['text'])) {
 122                         $data['text'] = $data['url'];
 123                 }
 124
 125                 if (!empty($data['url'])) {
 126                         $text .= " url='" . $data['url'] . "'";
 127                 }
 128
 129                 if (!empty($data['title'])) {
 130                         $text .= " title='" . $data['title'] . "'";
 131                 }
 132
 133                 // Only embedd a picture link when it seems to be a valid picture ("width" is set)
 134                 if (!empty($data['images']) && !empty($data['images'][0]['width'])) {
 135                         $preview = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['images'][0]['src'], ENT_QUOTES, 'UTF-8', false));
 136                         // if the preview picture is larger than 500 pixels then show it in a larger mode
 137                         // But only, if the picture isn't higher than large (To prevent huge posts)
 138                         if (!DI::config()->get('system', 'always_show_preview') && ($data['images'][0]['width'] >= 500)
 139                                 && ($data['images'][0]['width'] >= $data['images'][0]['height'])) {
 140                                 $text .= " image='" . $preview . "'";
 141                         } else {
 142                                 $text .= " preview='" . $preview . "'";
 143                         }
 144                 }
 145
 146                 $text .= ']' . $data['text'] . '[/attachment]';
 147
 148                 $hashtags = '';
 149                 if (!empty($data['keywords'])) {
 150                         $hashtags = "\n";
 151                         foreach ($data['keywords'] as $keyword) {
 152                                 /// @TODO make a positive list of allowed characters
 153                                 $hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
 154                                 $hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
 155                         }
 156                 }
 157
 158                 return $text . $hashtags;
 159         }
 160
 161         /**
 162          * @param string  $url
 163          * @param string $photo
 164          * @param bool $keywords
 165          * @param string $keyword_denylist
 166          * @return array|bool
 167          * @throws HTTPException\InternalServerErrorException
 168          */
 169         public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
 170         {
 171                 $data = ParseUrl::getSiteinfoCached($url, true);
 172
 173                 if ($photo != '') {
 174                         $data['images'][0]['src'] = $photo;
 175                 }
 176
 177                 if (!$keywords) {
 178                         unset($data['keywords']);
 179                 } elseif ($keyword_denylist) {
 180                         $list = explode(', ', $keyword_denylist);
 181
 182                         foreach ($list as $keyword) {
 183                                 $keyword = trim($keyword);
 184
 185                                 $index = array_search($keyword, $data['keywords']);
 186                                 if ($index !== false) {
 187                                         unset($data['keywords'][$index]);
 188                                 }
 189                         }
 190                 }
 191
 192                 Logger::info('fetch page info for URL', ['url' => $url, 'data' => $data]);
 193
 194                 return $data;
 195         }
 196
 197         /**
 198          * @param string $url
 199          * @param string $photo
 200          * @param string $keyword_denylist
 201          * @return array
 202          * @throws HTTPException\InternalServerErrorException
 203          */
 204         public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = '')
 205         {
 206                 $data = self::queryUrl($url, $photo, true, $keyword_denylist);
 207
 208                 $taglist = [];
 209                 foreach ($data['keywords'] as $keyword) {
 210                         $hashtag = str_replace([' ', '+', '/', '.', '#', "'"],
 211                                 ['', '', '', '', '', ''], $keyword);
 212
 213                         $taglist[] = $hashtag;
 214                 }
 215
 216                 return $taglist;
 217         }
 218
 219         /**
 220          * Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info.
 221          *
 222          * @param string $body
 223          * @param bool   $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort
 224          * @return string|null
 225          */
 226         protected static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false)
 227         {
 228                 $URLSearchString = 'https?://[^\[\]]*';
 229
 230                 // Fix for Mastodon where the mentions are in a different format
 231                 $body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);
 232
 233                 preg_match("~(?<![!#@])\[url]($URLSearchString)\[/url]$~is", $body, $matches);
 234
 235                 if (!$matches) {
 236                         preg_match("~(?<![!#@])\[url=($URLSearchString)].*\[/url]$~is", $body, $matches);
 237                 }
 238
 239                 if (!$matches && $searchNakedUrls) {
 240                         preg_match('~(?<=\W|^)(?<![=\]])(https?://.+)$~is', $body, $matches);
 241                         if ($matches && !Strings::endsWith($body, $matches[1])) {
 242                                 unset($matches);
 243                         }
 244                 }
 245
 246                 return $matches[1] ?? null;
 247         }
 248
 249         /**
 250          * Remove the provided URL from the body if it is at the end of it.
 251          * Keep the link label if it isn't the full URL.
 252          *
 253          * @param string $body
 254          * @param string $url
 255          * @return string|string[]|null
 256          */
 257         protected static function stripTrailingUrlFromBody(string $body, string $url)
 258         {
 259                 $quotedUrl = preg_quote($url, '#');
 260                 $body = preg_replace("#(?:
 261                         \[url]$quotedUrl\[/url]|
 262                         \[url=$quotedUrl]$quotedUrl\[/url]|
 263                         \[url=$quotedUrl]([^[]*?)\[/url]|
 264                         $quotedUrl
 265                 )$#isx", '$1', $body);
 266
 267                 return $body;
 268         }
 269 }