Continued:

[friendica.git] / src / Content / Text / BBCode.php
diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php

index 239e6dfa092c6217bf6ed4ca5c643706eb285726..835a96e4d3b58c158b4fcaef3560f4edd6402af6 100644 (file)
--- a/src/Content/Text/BBCode.php
+++ b/src/Content/Text/BBCode.php
@@ -1,6 +1,6 @@
  <?php
  /**
- * @copyright Copyright (C) 2010-2023, the Friendica project
+ * @copyright Copyright (C) 2010-2024, the Friendica project
   *
   * @license GNU AGPL version 3 or any later version
   *
@@ -230,18 +230,73 @@ class BBCode
         {
                 DI::profiler()->startRecording('rendering');
                 // Remove pictures in advance to avoid unneeded proxy calls
+               $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text);
                 $text = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $2 ', $text);
                 $text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);
  
                 // Remove attachment
                 $text = self::replaceAttachment($text);
  
-               $naked_text = HTML::toPlaintext(self::convert($text, false, BBCode::EXTERNAL, true), 0, !$keep_urls);
+               $naked_text = HTML::toPlaintext(self::convert($text, false, self::EXTERNAL, true), 0, !$keep_urls);
  
                 DI::profiler()->stopRecording();
                 return $naked_text;
         }
  
+       /**
+        * Converts text into a format that can be used for the channel search and the language detection.
+        *
+        * Attachments are removed, image alt texts are kept, mentions and links are dropped, hashtags keep
+        * their text, and the result is converted to plain text with all white space collapsed.
+        *
+        * @param string  $text   BBCode text to convert
+        * @param integer $uri_id URI id of the post; when non-empty, its image attachments are added to the body first
+        * @return string Trimmed plain text without line breaks; empty string if there is nothing to convert
+        */
+       public static function toSearchText(string $text, int $uri_id): string
+       {
+               // Removes attachments
+               $text = self::removeAttachment($text);
+
+               // Add images because of possible alt texts
+               if (!empty($uri_id)) {
+                       $text = Post\Media::addAttachmentsToBody($uri_id, $text, [Post\Media::IMAGE]);
+               }
+
+               if (empty($text)) {
+                       return '';
+               }
+
+               // Ordered rewrite rules (pattern => replacement). The order matters: the specific
+               // image and link forms have to be handled before the catch-all rules that follow them.
+               $rules = [
+                       // Remove links without a link description (the link text is itself a URL).
+                       // The URL part must not cross a bracket and the link text is matched lazily, so
+                       // that text between several links on the same line is not swallowed.
+                       "~\[url\=[^\[\]]*\]https?:.*?\[\/url\]~" => ' ',
+
+                       // Remove pictures that only carry a size ([img=WxH]...[/img])
+                       "/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism" => ' ',
+
+                       // Replace picture with the alt description
+                       "/\[img\=.*?\](.*?)\[\/img\]/ism" => ' $1 ',
+
+                       // Remove the other pictures
+                       "/\[img.*?\[\/img\]/ism" => ' ',
+
+                       // Removes mentions
+                       '/[@!]\[url\=.*?\].*?\[\/url\]/ism' => ' ',
+
+                       // Remove links from hashtags, keep the tag text
+                       '/[#]\[url\=.*?\](.*?)\[\/url\]/ism' => ' #$1 ',
+
+                       // Remove every remaining link, including its link text
+                       '/[@!#]?\[url.*?\[\/url\]/ism' => ' ',
+
+                       // NOTE(review): the rule above already removes all remaining [url...[/url] spans, so
+                       // this one appears to never match. If link descriptions are meant to be kept for the
+                       // search, it has to run before the previous rule - confirm the intent before changing.
+                       "/\[url=[^\[\]]*\](.*)\[\/url\]/Usi" => ' $1 ',
+               ];
+
+               foreach ($rules as $pattern => $replacement) {
+                       // preg_replace() returns null on a PCRE error; keep the text from before the failing rule
+                       $text = preg_replace($pattern, $replacement, $text) ?? $text;
+               }
+
+               // Convert it to plain text
+               $text = self::toPlaintext($text, false);
+
+               // Remove possibly remaining links
+               $text = preg_replace(Strings::autoLinkRegEx(), '', $text) ?? $text;
+
+               // Remove all unneeded white space: line breaks and double quotes become spaces and
+               // double spaces are collapsed, repeated until the text no longer changes.
+               do {
+                       $oldtext = $text;
+                       $text = str_replace(['  ', "\n", "\r", '"'], ' ', $text);
+               } while ($oldtext !== $text);
+
+               return trim($text);
+       }
+
         private static function proxyUrl(string $image, int $simplehtml = self::INTERNAL, int $uriid = 0, string $size = ''): string
         {
                 // Only send proxied pictures to API and for internal display
@@ -931,7 +986,7 @@ class BBCode
                                 $network = $contact['network'] ?? Protocol::PHANTOM;
  
                                 $tpl = Renderer::getMarkupTemplate('shared_content.tpl');
-                               $text .= BBCode::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
+                               $text .= self::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
                                         '$profile'      => $attributes['profile'],
                                         '$avatar'       => $attributes['avatar'],
                                         '$author'       => $attributes['author'],
@@ -1112,6 +1167,7 @@ class BBCode
         public static function removeLinks(string $bbcode): string
         {
                 DI::profiler()->startRecording('rendering');
+               $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode);
                 $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode);
                 $bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode);
  
@@ -1178,7 +1234,7 @@ class BBCode
         }
  
         /**
-        * Expand Youtube and Vimeo links to 
+        * Expand Youtube and Vimeo links to
          *
          * @param string $text
          * @return string
@@ -1331,7 +1387,7 @@ class BBCode
                                         "\n[hr]", "[hr]\n", " [hr]", "[hr] ",
                                         "\n[attachment ", " [attachment ", "\n[/attachment]", "[/attachment]\n", " [/attachment]", "[/attachment] ",
                                         "[table]\n", "[table] ", " [table]", "\n[/table]", " [/table]", "[/table] ",
-                                       " \n", "\t\n", "[/li]\n", "\n[li]", "\n[*]", 
+                                       " \n", "\t\n", "[/li]\n", "\n[li]", "\n[*]",
                                 ];
                                 $replace = [
                                         "[th]", "[th]", "[th]", "[/th]", "[/th]", "[/th]",
@@ -1424,14 +1480,14 @@ class BBCode
                                 if ($simple_html == self::INTERNAL) {
                                         //Ensure to always start with <h4> if possible
                                         $heading_count = 0;
-                                       for ($level = 6; $level > 0; $level--) { 
+                                       for ($level = 6; $level > 0; $level--) {
                                                 if (preg_match("(\[h$level\].*?\[\/h$level\])ism", $text)) {
                                                         $heading_count++;
                                                 }
                                         }
                                         if ($heading_count > 0) {
                                                 $heading = min($heading_count + 3, 6);
-                                               for ($level = 6; $level > 0; $level--) { 
+                                               for ($level = 6; $level > 0; $level--) {
                                                         if (preg_match("(\[h$level\].*?\[\/h$level\])ism", $text)) {
                                                                 $text = preg_replace("(\[h$level\](.*?)\[\/h$level\])ism", "</p><h$heading>$1</h$heading><p>", $text);
                                                                 $heading--;
@@ -1492,7 +1548,11 @@ class BBCode
                                 $text = preg_replace("(\[style=(.*?)\](.*?)\[\/style\])ism", '<span style="$1">$2</span>', $text);
  
                                 // Mastodon Emoji (internal tag, do not document for users)
-                               $text = preg_replace("(\[emoji=(.*?)](.*?)\[/emoji])ism", '<span class="mastodon emoji"><img src="$1" alt="$2" title="$2"/></span>', $text);
+                               if ($simple_html == self::MASTODON_API) {
+                                       $text = preg_replace("(\[emoji=(.*?)](.*?)\[/emoji])ism", '$2', $text);
+                               } else {
+                                       $text = preg_replace("(\[emoji=(.*?)](.*?)\[/emoji])ism", '<span class="mastodon emoji"><img src="$1" alt="$2" title="$2"/></span>', $text);
+                               }
  
                                 // Check for CSS classes
                                 // @deprecated since 2021.12, left for backward-compatibility reasons
@@ -1898,7 +1958,7 @@ class BBCode
                                         $text
                                 );
  
-                               // sanitize href attributes (only allowlisted protocols URLs)
+                               // sanitize href attributes (only whitelisted protocols URLs)
                                 // default value for backward compatibility
                                 $allowed_link_protocols = DI::config()->get('system', 'allowed_link_protocols', []);
  
@@ -1992,7 +2052,7 @@ class BBCode
         {
                 DI::profiler()->startRecording('rendering');
  
-               $text = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
+               $text = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
                         $text = preg_replace("/[\s|\n]*\[abstract\].*?\[\/abstract\][\s|\n]*/ism", ' ', $text);
                         $text = preg_replace("/[\s|\n]*\[abstract=.*?\].*?\[\/abstract][\s|\n]*/ism", ' ', $text);
                         return $text;
@@ -2014,7 +2074,7 @@ class BBCode
                 DI::profiler()->startRecording('rendering');
                 $addon = strtolower($addon);
  
-               $abstract = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
+               $abstract = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
                         if ($addon && preg_match('#\[abstract=' . preg_quote($addon, '#') . '](.*?)\[/abstract]#ism', $text, $matches)) {
                                 return $matches[1];
                         }
@@ -2111,6 +2171,9 @@ class BBCode
                 // Maybe we should make this newline at every time before a quote.
                 $text = str_replace(['</a><blockquote>'], ['</a><br><blockquote>'], $text);
  
+               // The converter doesn't convert these elements
+               $text = str_replace(['<div>', '</div>'], ['<p>', '</p>'], $text);
+
                 // Now convert HTML to Markdown
                 $text = HTML::toMarkdown($text);