]> git.mxchange.org Git - friendica.git/blob - src/Content/Text/NPF.php
Reworked parser
[friendica.git] / src / Content / Text / NPF.php
1 <?php
2 /**
3  * @copyright Copyright (C) 2010-2023, the Friendica project
4  *
5  * @license GNU AGPL version 3 or any later version
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Affero General Public License as
9  * published by the Free Software Foundation, either version 3 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Affero General Public License for more details.
16  *
17  * You should have received a copy of the GNU Affero General Public License
18  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  */
21
22 namespace Friendica\Content\Text;
23
24 use DOMDocument;
25 use DOMElement;
26 use Friendica\Model\Photo;
27 use Friendica\Model\Post;
28
29 /**
30  * Tumblr Neue Post Format
31  * @see https://www.tumblr.com/docs/npf
32  */
33 class NPF
34 {
35         static public function fromBBCode(string $bbcode, int $uri_id): array
36         {
37                 $bbcode = self::prepareBody($bbcode);
38
39                 $html = BBCode::convert($bbcode, false, BBCode::CONNECTORS);
40                 if (empty($html)) {
41                         return [];
42                 }
43
44                 $doc = new DOMDocument();
45                 $doc->formatOutput = true;
46                 if (!@$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'))) {
47                         return [];
48                 }
49
50                 $element = $doc->getElementsByTagName('body')->item(0);
51                 echo $element->ownerDocument->saveHTML($element) . "\n";
52
53                 $npf        = [];
54                 $text       = '';
55                 $formatting = [];
56
57                 self::routeChildren($element, $uri_id, true, [], $npf, $text, $formatting);
58
59                 return self::addLinkBlockForUriId($uri_id, 0, $npf);
60         }
61
62         static private function prepareBody(string $body): string
63         {
64                 $shared = BBCode::fetchShareAttributes($body);
65                 if (!empty($shared)) {
66                         $body = $shared['shared'];
67                 }
68
69                 $body = BBCode::removeAttachment($body);
70
71                 $body = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '[img]$3[/img]', $body);
72
73                 if (preg_match_all("#\[url=([^\]]+?)\]\s*\[img=([^\[\]]*)\]([^\[\]]*)\[\/img\]\s*\[/url\]#ism", $body, $pictures, PREG_SET_ORDER)) {
74                         foreach ($pictures as $picture) {
75                                 if (preg_match('#/photo/.*-[01]\.#ism', $picture[2]) && (preg_match('#/photo/.*-0\.#ism', $picture[1]) || preg_match('#/photos/.*/image/#ism', $picture[1]))) {
76                                         $body = str_replace($picture[0], "\n\n[img=" . str_replace('-1.', '-0.', $picture[2]) . "]" . $picture[3] . "[/img]\n\n", $body);
77                                 }
78                         }
79                 }
80
81                 $body = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", "\n\n[img=$1]$2[/img]\n\n", $body);
82
83                 if (preg_match_all("#\[url=([^\]]+?)\]\s*\[img\]([^\[]+?)\[/img\]\s*\[/url\]#ism", $body, $pictures, PREG_SET_ORDER)) {
84                         foreach ($pictures as $picture) {
85                                 if (preg_match('#/photo/.*-[01]\.#ism', $picture[2]) && (preg_match('#/photo/.*-0\.#ism', $picture[1]) || preg_match('#/photos/.*/image/#ism', $picture[1]))) {
86                                         $body = str_replace($picture[0], "\n\n[img]" . str_replace('-1.', '-0.', $picture[2]) . "[/img]\n\n", $body);
87                                 }
88                         }
89                 }
90
91                 $body = preg_replace("/\[img\](.*?)\[\/img\]/ism", "\n\n[img]$1[/img]\n\n", $body);
92                 $body = preg_replace("/\[audio\](.*?)\[\/audio\]/ism", "\n\n[audio]$1[/audio]\n\n", $body);
93                 $body = preg_replace("/\[video\](.*?)\[\/video\]/ism", "\n\n[video]$1[/video]\n\n", $body);
94
95                 do {
96                         $oldbody = $body;
97                         $body = str_replace(["\n\n\n"], ["\n\n"], $body);
98                 } while ($oldbody != $body);
99
100                 return trim($body);
101         }
102
103         static private function routeChildren(DOMElement $element, int $uri_id, bool $parse_structure, array $callstack, array &$npf, string &$text, array &$formatting)
104         {
105                 if ($parse_structure && $text) {
106                         self::addBlock($text, $formatting, $npf, $callstack);
107                 }
108
109                 $callstack[] = $element->nodeName;
110                 $level = self::getLevelByCallstack($callstack);
111
112                 foreach ($element->childNodes as $child) {
113                         switch ($child->nodeName) {
114                                 case 'b':
115                                 case 'strong':
116                                         self::addFormatting($child, $uri_id, 'bold', $callstack, $npf, $text, $formatting);
117                                         break;
118         
119                                 case 'i':
120                                 case 'em':
121                                         self::addFormatting($child, $uri_id, 'italic', $callstack, $npf, $text, $formatting);
122                                         break;
123         
124                                 case 's':
125                                         self::addFormatting($child, $uri_id, 'strikethrough', $callstack, $npf, $text, $formatting);
126                                         break;
127
128                                 case 'u':
129                                 case 'span':
130                                         self::addFormatting($child, $uri_id, '', $callstack, $npf, $text, $formatting);
131                                         break;
132
133                                 case 'hr':
134                                 case 'br':
135                                         if (!empty($text)) {
136                                                 $text .= "\n";
137                                         }
138                                         break;
139                 
140                                 case '#text':
141                                         $text .= $child->textContent;
142                                         break;
143
144                                 case 'table':
145                                 case 'summary':
146                                         // Ignore tables and spoilers
147                                         break;
148
149                                 case 'a':
150                                         if ($text) {
151                                                 self::addInlineLink($child, $uri_id, $callstack, $npf, $text, $formatting);
152                                         } else {
153                                                 $npf = self::addLinkBlock($child, $uri_id, $level, $npf);
154                                         }
155                                         break;
156
157                                 case 'img':
158                                         $npf = self::addImageBlock($child, $uri_id, $level, $npf);
159                                         break;
160
161                                 case 'ol':
162                                 case 'div':
163                                 case 'h1':
164                                 case 'h2':
165                                 case 'h3':
166                                 case 'h4':
167                                 case 'h5':
168                                 case 'h6':
169                                 case 'blockquote':
170                                 case 'p':
171                                 case 'pre':
172                                 case 'code':
173                                 case 'ul':
174                                 case 'li':
175                                 case 'details':
176                                         self::routeChildren($child, $uri_id, true, $callstack, $npf, $text, $formatting);
177                                         break;
178
179                                 default:
180                                         print_r($npf);
181                                         print_r($callstack);
182                                         die($child . "\n");
183                         }
184                 }
185
186                 if ($parse_structure && $text) {
187                         self::addBlock($text, $formatting, $npf, $callstack);
188                 }
189         }
190
191         static private function getLevelByCallstack($callstack): int
192         {
193                 $level = 0;
194                 foreach ($callstack as $entry) {
195                         if (in_array($entry, ['ol', 'ul', 'blockquote'])) {
196                                 ++$level;
197                         }
198                 }
199                 return max(0, $level - 1);
200         }
201
202         static private function getSubTypeByCallstack($callstack): string
203         {
204                 $subtype = '';
205                 foreach ($callstack as $entry) {
206                         switch ($entry) {
207                                 case 'ol':
208                                         $subtype = 'ordered-list-item';
209                                         break;
210
211                                 case 'ul':
212                                         $subtype = 'unordered-list-item';
213                                         break;
214
215                                 case 'h1':
216                                         $subtype = 'heading1';
217                                         break;
218         
219                                 case 'h2':
220                                         $subtype = 'heading1';
221                                         break;
222         
223                                 case 'h3':
224                                         $subtype = 'heading1';
225                                         break;
226         
227                                 case 'h4':
228                                         $subtype = 'heading2';
229                                         break;
230         
231                                 case 'h5':
232                                         $subtype = 'heading2';
233                                         break;
234         
235                                 case 'h6':
236                                         $subtype = 'heading2';
237                                         break;
238         
239                                 case 'blockquote':
240                                 case 'pre':
241                                 case 'code':
242                                         $subtype = 'indented';
243                                         break;
244                         }
245                 }
246                 return $subtype;
247         }
248
249         static private function addFormatting(DOMElement $element, int $uri_id, string $type, array $callstack, array &$npf, string &$text, array &$formatting)
250         {
251                 $start = mb_strlen($text);
252                 self::routeChildren($element, $uri_id, false, $callstack, $npf, $text, $formatting);
253
254                 if (!empty($type)) {
255                         $formatting[] = [
256                                 'start' => $start,
257                                 'end'   => mb_strlen($text),
258                                 'type'  => $type
259                         ];
260                 }
261         }
262
263         static private function addInlineLink(DOMElement $element, int $uri_id, array $callstack, array &$npf, string &$text, array &$formatting)
264         {
265                 $start = mb_strlen($text);
266                 self::routeChildren($element, $uri_id, false, $callstack, $npf, $text, $formatting);
267
268                 $attributes = [];
269                 foreach ($element->attributes as $key => $attribute) {
270                         $attributes[$key] = trim($attribute->value);
271                 }
272                 if (!empty($attributes['href'])) {
273                         $formatting[] = [
274                                 'start' => $start,
275                                 'end'   => mb_strlen($text),
276                                 'type'  => 'link',
277                                 'url'   => $attributes['href']
278                         ];
279                 }
280         }
281
282         static private function addBlock(string &$text, array &$formatting, array &$npf, array $callstack)
283         {
284                 $block = [
285                         'callstack' => $callstack,
286                         'type'      => 'text',
287                         'text'      => $text,
288                 ];
289
290                 if (!empty($formatting)) {
291                         $block['formatting'] = $formatting;
292                 }
293
294                 $level = self::getLevelByCallstack($callstack);
295                 if ($level > 0) {
296                         $block['indent_level'] = $level;
297                 }
298
299                 $subtype = self::getSubTypeByCallstack($callstack);
300                 if ($subtype) {
301                         $block['subtype'] = $subtype;
302                 }
303
304                 $npf[] = $block;
305                 $text = '';
306                 $formatting = [];
307         }
308
309         static private function addPoster(array $media, array $block): array
310         {
311                 $poster = [];
312                 if (!empty($media['preview'])) {
313                         $poster['url'] = $media['preview'];
314                 }
315                 if (!empty($media['preview-width'])) {
316                         $poster['width'] = $media['preview-width'];
317                 }
318                 if (!empty($media['preview-height'])) {
319                         $poster['height'] = $media['preview-height'];
320                 }
321                 if (!empty($poster)) {
322                         $block['poster'] = $poster;
323                 }
324                 return $block;
325         }
326
327         static private function addLinkBlockForUriId(int $uri_id, int $level, array $npf): array
328         {
329                 foreach (Post\Media::getByURIId($uri_id, [Post\Media::HTML]) as $link) {
330                         $host = parse_url($link['url'], PHP_URL_HOST);
331                         if (in_array($host, ['www.youtube.com', 'youtu.be'])) {
332                                 $block = [
333                                         'type'     => 'video',
334                                         'provider' => 'youtube',
335                                         'url'      => $link['url'],
336                                 ];
337                         } elseif (in_array($host, ['vimeo.com'])) {
338                                 $block = [
339                                         'type'     => 'video',
340                                         'provider' => 'vimeo',
341                                         'url'      => $link['url'],
342                                 ];
343                         } elseif (in_array($host, ['open.spotify.com'])) {
344                                 $block = [
345                                         'type'     => 'audio',
346                                         'provider' => 'spotify',
347                                         'url'      => $link['url'],
348                                 ];
349                         } else {
350                                 $block = [
351                                         'type' => 'link',
352                                         'url'  => $link['url'],
353                                 ];
354                                 if (!empty($link['name'])) {
355                                         $block['title'] = $link['name'];
356                                 }
357                                 if (!empty($link['description'])) {
358                                         $block['description'] = $link['description'];
359                                 }
360                                 if (!empty($link['author-name'])) {
361                                         $block['author'] = $link['author-name'];
362                                 }
363                                 if (!empty($link['publisher-name'])) {
364                                         $block['site_name'] = $link['publisher-name'];
365                                 }
366                         }
367
368                         if ($level > 0) {
369                                 $block['indent_level'] = $level;
370                         }
371
372                         $npf[] = self::addPoster($link, $block);
373                 }
374                 return $npf;
375         }
376
377         static private function addImageBlock(DOMElement $element, int $uri_id, int $level, array $npf): array
378         {
379                 $attributes = [];
380                 foreach ($element->attributes as $key => $attribute) {
381                         $attributes[$key] = trim($attribute->value);
382                 }
383                 if (empty($attributes['src'])) {
384                         return $npf;
385                 }
386
387                 $block = [
388                         'type'  => 'image',
389                         'media' => [],
390                 ];
391
392                 if (!empty($attributes['alt'])) {
393                         $block['alt_text'] = $attributes['alt'];
394                 }
395
396                 if (!empty($attributes['title']) && (($attributes['alt'] ?? '') != $attributes['title'])) {
397                         $block['caption'] = $attributes['title'];
398                 }
399
400                 $rid = Photo::ridFromURI($attributes['src']);
401                 if (!empty($rid)) {
402                         $photos = Photo::selectToArray([], ['resource-id' => $rid]);
403                         foreach ($photos as $photo) {
404                                 $block['media'][] = [
405                                         'type'   => $photo['type'],
406                                         'url'    => str_replace('-0.', '-' . $photo['scale'] . '.', $attributes['src']),
407                                         'width'  => $photo['width'],
408                                         'height' => $photo['height'],
409                                 ];
410                         }
411                         if (empty($attributes['alt']) && !empty($photos[0]['desc'])) {
412                                 $block['alt_text'] = $photos[0]['desc'];
413                         }
414                 } elseif ($media = Post\Media::getByURL($uri_id, $attributes['src'], [Post\Media::IMAGE])) {
415                         $block['media'][] = [
416                                 'type'   => $media['mimetype'],
417                                 'url'    => $media['url'],
418                                 'width'  => $media['width'],
419                                 'height' => $media['height'],
420                         ];
421                         if (empty($attributes['alt']) && !empty($media['description'])) {
422                                 $block['alt_text'] = $media['description'];
423                         }
424                 } else {
425                         $block['media'][] = ['url' => $attributes['src']];
426                 }
427
428                 if ($level > 0) {
429                         $block['indent_level'] = $level;
430                 }
431
432                 $npf[] = $block;
433
434                 return $npf;
435         }
436
437         static private function addLinkBlock(DOMElement $element, int $uri_id, int $level, array $npf): array
438         {
439                 $attributes = [];
440                 foreach ($element->attributes as $key => $attribute) {
441                         $attributes[$key] = trim($attribute->value);
442                 }
443                 if (empty($attributes['href'])) {
444                         return $npf;
445                 }
446
447                 $media = Post\Media::getByURL($uri_id, $attributes['href'], [Post\Media::AUDIO, Post\Media::VIDEO]);
448                 if (!empty($media)) {
449                         switch ($media['type']) {
450                                 case Post\Media::AUDIO:
451                                         $block = [
452                                                 'type' => 'audio',
453                                                 'media' => [
454                                                         'type' => $media['mimetype'],
455                                                         'url'  => $media['url'],
456                                                 ]
457                                         ];
458
459                                         if (!empty($media['name'])) {
460                                                 $block['title'] = $media['name'];
461                                         } elseif (!empty($media['description'])) {
462                                                 $block['title'] = $media['description'];
463                                         }
464
465                                         $block = self::addPoster($media, $block);
466                                         break;
467
468                                 case Post\Media::VIDEO:
469                                         $block = [
470                                                 'type' => 'video',
471                                                 'media' => [
472                                                         'type' => $media['mimetype'],
473                                                         'url'  => $media['url'],
474                                                 ]
475                                         ];
476
477                                         $block = self::addPoster($media, $block);
478                                         break;
479                         }
480                 } else {
481                         $block = [
482                                 'type' => 'text',
483                                 'text' => $element->textContent,
484                                 'formatting' => [
485                                         'start' => 0,
486                                         'end'   => strlen($element->textContent),
487                                         'type'  => 'link',
488                                         'url'   => $attributes['href']
489                                 ]
490                         ];
491                 }
492
493                 if ($level > 0) {
494                         $block['indent_level'] = $level;
495                 }
496
497                 $npf[] = $block;
498
499                 return $npf;
500         }
501 }