]> git.mxchange.org Git - friendica.git/blob - src/Content/Text/Plaintext.php
Merge pull request #13132 from tobiasd/20230516-changelog
[friendica.git] / src / Content / Text / Plaintext.php
1 <?php
2 /**
3  * @copyright Copyright (C) 2010-2023, the Friendica project
4  *
5  * @license GNU AGPL version 3 or any later version
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Affero General Public License as
9  * published by the Free Software Foundation, either version 3 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Affero General Public License for more details.
16  *
17  * You should have received a copy of the GNU Affero General Public License
18  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  */
21
22 namespace Friendica\Content\Text;
23
24 use Friendica\Core\Protocol;
25 use Friendica\DI;
26 use Friendica\Model\Photo;
27 use Friendica\Model\Post;
28 use Friendica\Util\Network;
29
class Plaintext
{
	// Assumed length of an URL when shortened via the network's own url shortener (e.g. Twitter)
	const URL_LENGTH = 23;

	/**
	 * Shortens a message so that it fits into the given number of characters.
	 *
	 * When the user has enabled "simple_shortening", the trimmed message is
	 * cut at the limit and an ellipsis is appended. Otherwise the message is
	 * assembled line by line: lines are added as long as the result fits.
	 * The first line that does not fit is cut (with an ellipsis) only when
	 * nothing has been collected yet, or when the only collected line is a
	 * reshare header (starting with the recycle symbol); in every other case
	 * the message simply ends before that line.
	 *
	 * Note that the per-line assembly trims after every step, so blank lines
	 * are collapsed.
	 *
	 * @param  string $msg   Message to shorten
	 * @param  int    $limit Maximum number of characters
	 * @param  int    $uid   User id whose "simple_shortening" setting is read; 0 skips the lookup
	 * @return string
	 *
	 * @todo For Twitter URLs aren't shortened, but they have to be calculated as if.
	 */
	public static function shorten(string $msg, int $limit, int $uid = 0): string
	{
		$ellipsis = "\u{2026}";

		if (!empty($uid) && DI::pConfig()->get($uid, 'system', 'simple_shortening')) {
			$trimmed = trim($msg);

			// A message that already fits must not lose characters to an ellipsis
			if (mb_strlen($trimmed) <= $limit) {
				return $trimmed;
			}

			return mb_substr(mb_substr($trimmed, 0, $limit), 0, -3) . $ellipsis;
		}

		// Recycle symbol followed by a space, the prefix of a reshared message
		$recycle = "\u{2672} ";
		$result  = '';

		foreach (explode("\n", $msg) as $row => $line) {
			$candidate = trim($result . "\n" . $line);

			if (mb_strlen($candidate) <= $limit) {
				$result = $candidate;
				continue;
			}

			// Is the new message empty by now or is it a reshared message?
			$isReshareHeader = ($row == 1) && (mb_substr($result, 0, 2) === $recycle);
			if (($result === '') || $isReshareHeader) {
				$result = mb_substr(mb_substr($candidate, 0, $limit), 0, -3) . $ellipsis;
			}

			// Either way the message is complete now: nothing may follow a
			// truncation marker, and a line that did not fit ends the message.
			break;
		}

		return $result;
	}

	/**
	 * Returns the byte positions of the provided boundaries, optionally skipping a number of first occurrences
	 *
	 * The closing boundary is searched starting at the position of the
	 * opening boundary that was found.
	 *
	 * @param string $text        Text to search
	 * @param string $open        Left boundary
	 * @param string $close       Right boundary
	 * @param int    $occurrences Number of first occurrences to skip (negative values count as 0)
	 * @return array|false ['start' => int, 'end' => int], or false when either boundary is missing
	 */
	public static function getBoundariesPosition($text, $open, $close, $occurrences = 0)
	{
		$skip = max(0, $occurrences);

		// Walk from one opening boundary to the next until enough were skipped
		$start_pos = -1;
		for ($found = 0; $found <= $skip; $found++) {
			$start_pos = strpos($text, $open, $start_pos + 1);
			if ($start_pos === false) {
				return false;
			}
		}

		$end_pos = strpos($text, $close, $start_pos);
		if ($end_pos === false) {
			return false;
		}

		return ['start' => $start_pos, 'end' => $end_pos];
	}

	/**
	 * Convert a message into plaintext for connectors to other networks
	 *
	 * The returned array contains at least 'type' and 'text'. Depending on the
	 * attached media it may also contain 'url', 'title', 'description',
	 * 'image', 'image_description', 'images' and 'remote_images'. When a
	 * limit is given, 'parts' holds the complete message split into chunks.
	 *
	 * @param array  $item           The message array that is about to be posted
	 * @param int    $limit          The maximum number of characters when posting to that network
	 * @param bool   $includedlinks  Has an attached link to be included into the message?
	 * @param int    $htmlmode       This controls the behavior of the BBCode conversion
	 *
	 * @return array Same array structure than \Friendica\Content\Text\BBCode::getAttachedData
	 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
	 * @see   \Friendica\Content\Text\BBCode::getAttachedData
	 */
	public static function getPost(array $item, int $limit = 0, bool $includedlinks = false, int $htmlmode = BBCode::MASTODON_API): array
	{
		// Fetch attached media information
		$post = self::getPostMedia($item);

		$title = $item['title'] ?? '';
		if (($title != '') && ($post['text'] != '')) {
			$post['text'] = trim($title . "\n\n" . $post['text']);
		} elseif ($title != '') {
			$post['text'] = trim($title);
		}

		// Fetch the abstract from the given target network.
		// The variable is initialised because not every branch assigns it.
		$abstract = '';
		switch ($htmlmode) {
			case BBCode::TWITTER:
				$abstract = BBCode::getAbstract($item['body'], Protocol::TWITTER);
				break;

			case BBCode::OSTATUS:
				$abstract = BBCode::getAbstract($item['body'], Protocol::STATUSNET);
				break;

			default: // We don't know the exact target.
				// We fetch an abstract since there is a posting limit.
				if ($limit > 0) {
					$abstract = BBCode::getAbstract($item['body']);
				}
				break;
		}

		if ($abstract != '') {
			$post['text'] = $abstract;

			if ($post['type'] == 'text') {
				$post['type'] = 'link';
				$post['url'] = $item['plink'];
			}
		}

		$html = BBCode::convertForUriId($item['uri-id'], $post['text'] . ($post['after'] ?? ''), $htmlmode);
		$msg = HTML::toPlaintext($html, 0, true);
		$msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));

		$complete_msg = $msg;

		$link = '';
		if ($includedlinks) {
			switch ($post['type']) {
				case 'link':
				case 'video':
					$link = $post['url'];
					break;

				case 'text':
					$link = $post['url'] ?? '';
					break;

				case 'photo':
					$link = $post['image'];
					break;
			}

			if (($msg == '') && isset($post['title'])) {
				$msg = trim($post['title']);
			}

			if (($msg == '') && isset($post['description'])) {
				$msg = trim($post['description']);
			}

			// If the link is already contained in the post, then it needn't to be added again
			// But: if the link is beyond the limit, then it has to be added.
			// The position is measured in characters since it is compared with
			// the limit and with mb_strlen() results below.
			$pos = ($link != '') ? mb_strpos($msg, $link) : false;
			if ($pos !== false) {
				// Will the text be shortened in the link?
				// Or is the link the last item in the post?
				if (($limit > 0) && ($pos < $limit) && (($pos + self::URL_LENGTH > $limit) || ($pos + mb_strlen($link) == mb_strlen($msg)))) {
					$msg = trim(str_replace($link, '', $msg));
				} elseif (($limit == 0) || ($pos < $limit)) {
					// The limit has to be increased since it will be shortened - but not now
					// Only do it with Twitter
					if (($limit > 0) && (mb_strlen($link) > self::URL_LENGTH) && ($htmlmode == BBCode::TWITTER)) {
						$limit = $limit - self::URL_LENGTH + mb_strlen($link);
					}

					$link = '';

					if ($post['type'] == 'text') {
						unset($post['url']);
					}
				}
			}
		}

		if ($limit > 0) {
			// Reduce multiple spaces
			// When posted to a network with limited space, we try to gain space where possible
			while (strpos($msg, '  ') !== false) {
				$msg = str_replace('  ', ' ', $msg);
			}

			if (!in_array($link, ['', $item['plink']]) && ($post['type'] != 'photo') && (strpos($complete_msg, $link) === false)) {
				$complete_msg .= "\n" . $link;
			}

			$post['parts'] = self::getParts(trim($complete_msg), $limit);

			// Twitter is using its own limiter, so we always assume that shortened links will have this length
			if (mb_strlen($link) > 0) {
				$limit = $limit - self::URL_LENGTH;
			}

			if (mb_strlen($msg) > $limit) {
				// The text gets cut, so the link has to lead to the complete post
				if (($post['type'] == 'text') && isset($post['url'])) {
					$post['url'] = $item['plink'];
				} elseif (!isset($post['url'])) {
					// Reserve space for the link that is added now
					$limit = $limit - self::URL_LENGTH;
					$post['url'] = $item['plink'];
				} elseif (strpos($item['body'], '[share') !== false) {
					$post['url'] = $item['plink'];
				} elseif (DI::pConfig()->get($item['uid'], 'system', 'no_intelligent_shortening')) {
					$post['url'] = $item['plink'];
				}
				$msg = self::shorten($msg, $limit, $item['uid']);
			}
		}

		$post['text'] = trim($msg);

		return $post;
	}

	/**
	 * Split the message in parts
	 *
	 * The message is consumed word by word (words end at a space or a line
	 * break). A part is closed when adding the next word would exceed the
	 * limit minus 8 characters, which are kept free for the " (x/y)" counter
	 * that is appended when there is more than one part. Words that are valid
	 * HTTP URLs are counted with URL_LENGTH characters instead of their real
	 * length.
	 *
	 * @param string $message   Complete message, expected to be trimmed
	 * @param int    $baselimit Maximum number of characters per part
	 * @return array List of message parts
	 */
	private static function getParts(string $message, int $baselimit): array
	{
		$parts = [];
		$part  = '';
		$limit = $baselimit;

		// Compare with '' explicitly: a plain truthiness check treats a
		// remaining "0" as empty and would silently drop it.
		while ($message !== '') {
			$space   = strpos($message, ' ');
			$newline = strpos($message, "\n");

			if (($space === false) && ($newline === false)) {
				// No separator left, the rest is the last word
				$word    = $message;
				$message = '';
			} else {
				if ($space === false) {
					$pos = $newline + 1;
				} elseif ($newline === false) {
					$pos = $space + 1;
				} else {
					$pos = min($space, $newline) + 1;
				}

				// The word keeps its trailing separator
				$word    = substr($message, 0, $pos);
				$message = trim(substr($message, $pos));
			}

			// URLs are counted with their assumed shortened length
			if (Network::isValidHttpUrl(trim($word))) {
				$limit += mb_strlen(trim($word)) - self::URL_LENGTH;
			}

			$exceeds  = mb_strlen($part . $word) > $limit - 8;
			$mustSplit = $parts || (mb_strlen($part . $word . $message) > $limit);

			// Never close an empty part, that would create a post that
			// consists of nothing but the counter.
			if ($exceeds && $mustSplit && (trim($part) !== '')) {
				$parts[] = trim($part);
				$part    = '';
				$limit   = $baselimit;
			}
			$part .= $word;
		}
		$parts[] = trim($part);

		$total = count($parts);
		if ($total > 1) {
			foreach ($parts as $key => $text) {
				$parts[$key] = $text . ' (' . ($key + 1) . '/' . $total . ')';
			}
		}

		return $parts;
	}

	/**
	 * Fetch attached media to the post and simplify the body.
	 *
	 * The result always contains 'type' and 'text'. Images, audio/video
	 * links and an attached page are looked up for the item's 'uri-id' and,
	 * when present, its 'quote-uri-id'.
	 *
	 * @param array $item
	 * @return array
	 */
	private static function getPostMedia(array $item): array
	{
		$post = ['type' => 'text', 'images' => [], 'remote_images' => []];

		// Remove mentions and hashtag links
		$URLSearchString = '^\[\]';
		$post['text'] = preg_replace("/([#!@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $item['body']);

		// Remove abstract
		$post['text'] = BBCode::stripAbstract($post['text']);
		// Remove attached links
		$post['text'] = BBCode::removeAttachment($post['text']);
		// Remove any links
		$post['text'] = Post\Media::removeFromBody($post['text']);

		$hasQuote = !empty($item['quote-uri-id']);

		$images = Post\Media::getByURIId($item['uri-id'], [Post\Media::IMAGE]);
		if ($hasQuote) {
			$images = array_merge($images, Post\Media::getByURIId($item['quote-uri-id'], [Post\Media::IMAGE]));
		}

		// Images with a photo id go to 'images', all others to 'remote_images'
		foreach ($images as $image) {
			$entry = ['url' => $image['url'], 'description' => $image['description']];

			$id = Photo::getIdForName($image['url']);
			if ($id) {
				$entry['id'] = $id;
				$post['images'][] = $entry;
			} else {
				$post['remote_images'][] = $entry;
			}
		}

		// Empty lists are not part of the result
		if (empty($post['images'])) {
			unset($post['images']);
		}

		if (empty($post['remote_images'])) {
			unset($post['remote_images']);
		}

		// The first image becomes the main image, images with an id are preferred
		$first = $post['images'][0] ?? $post['remote_images'][0] ?? null;
		if (!empty($first)) {
			$post['type']              = 'photo';
			$post['image']             = $first['url'];
			$post['image_description'] = $first['description'];
		}

		// Look for audio or video links
		$media = Post\Media::getByURIId($item['uri-id'], [Post\Media::AUDIO, Post\Media::VIDEO]);
		if ($hasQuote) {
			$media = array_merge($media, Post\Media::getByURIId($item['quote-uri-id'], [Post\Media::AUDIO, Post\Media::VIDEO]));
		}

		// When there are several entries, the last one wins
		foreach ($media as $medium) {
			if (in_array($medium['type'], [Post\Media::AUDIO, Post\Media::VIDEO])) {
				$post['type'] = 'link';
				$post['url']  = $medium['url'];
			}
		}

		// Look for an attached link
		$page = Post\Media::getByURIId($item['uri-id'], [Post\Media::HTML]);
		if ($hasQuote && empty($page)) {
			$page = Post\Media::getByURIId($item['quote-uri-id'], [Post\Media::HTML]);
		}

		if (!empty($page)) {
			$attached = $page[0];

			$post['type']          = 'link';
			$post['url']           = $attached['url'];
			$post['description']   = $attached['description'];
			$post['title']         = $attached['name'];

			if (empty($post['image']) && !empty($attached['preview'])) {
				$post['image'] = $attached['preview'];
			}
		}

		return $post;
	}
}