Add new Content\BBCode::toPlaintext()

[friendica.git] / src / Content / Text / BBCode.php
diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php

index 1148103054c2e37e3613581e08b29af922cd729c..96c8eceb70f97b0c7d07b6dd68296be31caf6b1b 100644 (file)
--- a/src/Content/Text/BBCode.php
+++ b/src/Content/Text/BBCode.php
@@ -7,7 +7,7 @@
  namespace Friendica\Content\Text;
  
  use DOMDocument;
-use DomXPath;
+use DOMXPath;
  use Exception;
  use Friendica\BaseObject;
  use Friendica\Content\OEmbed;
@@ -20,6 +20,7 @@ use Friendica\Core\PConfig;
  use Friendica\Core\Protocol;
  use Friendica\Core\System;
  use Friendica\Model\Contact;
+use Friendica\Model\Event;
  use Friendica\Network\Probe;
  use Friendica\Object\Image;
  use Friendica\Util\Map;
@@ -27,7 +28,6 @@ use Friendica\Util\Network;
  use Friendica\Util\ParseUrl;
  use League\HTMLToMarkdown\HtmlConverter;
  
-require_once "include/event.php";
  require_once "mod/proxy.php";
  
  class BBCode extends BaseObject
@@ -76,10 +76,12 @@ class BBCode extends BaseObject
  
                                         $picturedata = Image::getInfoFromURL($matches[1]);
  
-                                       if (($picturedata[0] >= 500) && ($picturedata[0] >= $picturedata[1])) {
-                                               $post["image"] = $matches[1];
-                                       } else {
-                                               $post["preview"] = $matches[1];
+                                       if ($picturedata) {
+                                               if (($picturedata[0] >= 500) && ($picturedata[0] >= $picturedata[1])) {
+                                                       $post["image"] = $matches[1];
+                                               } else {
+                                                       $post["preview"] = $matches[1];
+                                               }
                                         }
                                 }
  
@@ -241,6 +243,9 @@ class BBCode extends BaseObject
                         $body = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '[img]$3[/img]', $body);
  
                         $URLSearchString = "^\[\]";
+
+                       $body = preg_replace("/\[img\=([$URLSearchString]*)\](.*?)\[\/img\]/ism", '[img]$1[/img]', $body);
+
                         if (preg_match_all("(\[url=([$URLSearchString]*)\]\s*\[img\]([$URLSearchString]*)\[\/img\]\s*\[\/url\])ism", $body, $pictures, PREG_SET_ORDER)) {
                                 if ((count($pictures) == 1) && !$has_title) {
                                         // Checking, if the link goes to a picture
@@ -266,7 +271,7 @@ class BBCode extends BaseObject
                                                 $post["text"] = str_replace($pictures[0][0], "", $body);
                                         } else {
                                                 $imgdata = Image::getInfoFromURL($pictures[0][1]);
-                                               if (substr($imgdata["mime"], 0, 6) == "image/") {
+                                               if ($imgdata && substr($imgdata["mime"], 0, 6) == "image/") {
                                                         $post["type"] = "photo";
                                                         $post["image"] = $pictures[0][1];
                                                         $post["preview"] = $pictures[0][2];
@@ -338,159 +343,20 @@ class BBCode extends BaseObject
         }
  
         /**
-        * @brief Convert a message into plaintext for connectors to other networks
+        * @brief Converts a BBCode text into plaintext
          *
-        * @param array $b The message array that is about to be posted
-        * @param int $limit The maximum number of characters when posting to that network
-        * @param bool $includedlinks Has an attached link to be included into the message?
-        * @param int $htmlmode This triggers the behaviour of the bbcode conversion
-        * @param string $target_network Name of the network where the post should go to.
+        * @param bool $keep_urls Whether to keep URLs in the resulting plaintext
          *
-        * @return string The converted message
+        * @return string
          */
-       public static function toPlaintext($b, $limit = 0, $includedlinks = false, $htmlmode = 2, $target_network = "")
+       public static function toPlaintext($text, $keep_urls = true)
         {
-               // Remove the hash tags
-               $URLSearchString = "^\[\]";
-               $body = preg_replace("/([#@])\[url\=([$URLSearchString]*)\](.*?)\[\/url\]/ism", '$1$3', $b["body"]);
-
-               // Add an URL element if the text contains a raw link
-               $body = preg_replace("/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism", '$1[url]$2[/url]', $body);
-
-               // Remove the abstract
-               $body = self::stripAbstract($body);
-
-               // At first look at data that is attached via "type-..." stuff
-               // This will hopefully replaced with a dedicated bbcode later
-               //$post = self::getAttachedData($b["body"]);
-               $post = self::getAttachedData($body, $b);
-
-               if (($b["title"] != "") && ($post["text"] != "")) {
-                       $post["text"] = trim($b["title"]."\n\n".$post["text"]);
-               } elseif ($b["title"] != "") {
-                       $post["text"] = trim($b["title"]);
+               $naked_text = preg_replace('/\[(.+?)\]/','', $text);
+               if (!$keep_urls) {
+                       $naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text);
                 }
  
-               $abstract = "";
-
-               // Fetch the abstract from the given target network
-               if ($target_network != "") {
-                       $default_abstract = self::getAbstract($b["body"]);
-                       $abstract = self::getAbstract($b["body"], $target_network);
-
-                       // If we post to a network with no limit we only fetch
-                       // an abstract exactly for this network
-                       if (($limit == 0) && ($abstract == $default_abstract)) {
-                               $abstract = "";
-                       }
-               } else {// Try to guess the correct target network
-                       switch ($htmlmode) {
-                               case 8:
-                                       $abstract = self::getAbstract($b["body"], NETWORK_TWITTER);
-                                       break;
-                               case 7:
-                                       $abstract = self::getAbstract($b["body"], NETWORK_STATUSNET);
-                                       break;
-                               case 6:
-                                       $abstract = self::getAbstract($b["body"], NETWORK_APPNET);
-                                       break;
-                               default: // We don't know the exact target.
-                                       // We fetch an abstract since there is a posting limit.
-                                       if ($limit > 0) {
-                                               $abstract = self::getAbstract($b["body"]);
-                                       }
-                       }
-               }
-
-               if ($abstract != "") {
-                       $post["text"] = $abstract;
-
-                       if ($post["type"] == "text") {
-                               $post["type"] = "link";
-                               $post["url"] = $b["plink"];
-                       }
-               }
-
-               $html = self::convert($post["text"].$post["after"], false, $htmlmode);
-               $msg = HTML::toPlaintext($html, 0, true);
-               $msg = trim(html_entity_decode($msg, ENT_QUOTES, 'UTF-8'));
-
-               $link = "";
-               if ($includedlinks) {
-                       if ($post["type"] == "link") {
-                               $link = $post["url"];
-                       } elseif ($post["type"] == "text") {
-                               $link = $post["url"];
-                       } elseif ($post["type"] == "video") {
-                               $link = $post["url"];
-                       } elseif ($post["type"] == "photo") {
-                               $link = $post["image"];
-                       }
-
-                       if (($msg == "") && isset($post["title"])) {
-                               $msg = trim($post["title"]);
-                       }
-
-                       if (($msg == "") && isset($post["description"])) {
-                               $msg = trim($post["description"]);
-                       }
-
-                       // If the link is already contained in the post, then it neeedn't to be added again
-                       // But: if the link is beyond the limit, then it has to be added.
-                       if (($link != "") && strstr($msg, $link)) {
-                               $pos = strpos($msg, $link);
-
-                               // Will the text be shortened in the link?
-                               // Or is the link the last item in the post?
-                               if (($limit > 0) && ($pos < $limit) && (($pos + 23 > $limit) || ($pos + strlen($link) == strlen($msg)))) {
-                                       $msg = trim(str_replace($link, "", $msg));
-                               } elseif (($limit == 0) || ($pos < $limit)) {
-                                       // The limit has to be increased since it will be shortened - but not now
-                                       // Only do it with Twitter (htmlmode = 8)
-                                       if (($limit > 0) && (strlen($link) > 23) && ($htmlmode == 8)) {
-                                               $limit = $limit - 23 + strlen($link);
-                                       }
-
-                                       $link = "";
-
-                                       if ($post["type"] == "text") {
-                                               unset($post["url"]);
-                                       }
-                               }
-                       }
-               }
-
-               if ($limit > 0) {
-                       // Reduce multiple spaces
-                       // When posted to a network with limited space, we try to gain space where possible
-                       while (strpos($msg, "  ") !== false) {
-                               $msg = str_replace("  ", " ", $msg);
-                       }
-
-                       // Twitter is using its own limiter, so we always assume that shortened links will have this length
-                       if (iconv_strlen($link, "UTF-8") > 0) {
-                               $limit = $limit - 23;
-                       }
-
-                       if (iconv_strlen($msg, "UTF-8") > $limit) {
-                               if (($post["type"] == "text") && isset($post["url"])) {
-                                       $post["url"] = $b["plink"];
-                               } elseif (!isset($post["url"])) {
-                                       $limit = $limit - 23;
-                                       $post["url"] = $b["plink"];
-                               // Which purpose has this line? It is now uncommented, but left as a reminder
-                               //} elseif (strpos($b["body"], "[share") !== false) {
-                               //      $post["url"] = $b["plink"];
-                               } elseif (PConfig::get($b["uid"], "system", "no_intelligent_shortening")) {
-                                       $post["url"] = $b["plink"];
-                               }
-                               $msg = Plaintext::shorten($msg, $limit);
-                       }
-               }
-
-               $post["text"] = trim($msg);
-
-               return($post);
+               return $naked_text;
         }
  
         public static function scaleExternalImages($srctext, $include_link = true, $scale_replace = false)
@@ -680,7 +546,7 @@ class BBCode extends BaseObject
  
                 $return = '';
                 if ($simplehtml == 7) {
-                       $return = self::convertUrlForMastodon($data["url"]);
+                       $return = self::convertUrlForOStatus($data["url"]);
                 } elseif (($simplehtml != 4) && ($simplehtml != 0)) {
                         $return = sprintf('<a href="%s" target="_blank">%s</a><br>', $data["url"], $data["title"]);
                 } else {
@@ -708,9 +574,10 @@ class BBCode extends BaseObject
                                 }
  
                                 if ($data["description"] != "" && $data["description"] != $data["title"]) {
-                                       $return .= sprintf('<blockquote>%s</blockquote>', trim(self::convert($data["description"])));
+                                       // Sanitize the HTML by converting it to BBCode
+                                       $bbcode = HTML::toBBCode($data["description"]);
+                                       $return .= sprintf('<blockquote>%s</blockquote>', trim(self::convert($bbcode)));
                                 }
-
                                 if ($data["type"] == "link") {
                                         $return .= sprintf('<sup><a href="%s">%s</a></sup>', $data['url'], parse_url($data['url'], PHP_URL_HOST));
                                 }
@@ -757,7 +624,7 @@ class BBCode extends BaseObject
                 if (($data["url"] != "") && ($data["title"] != "")) {
                         $text .= "\n[url=" . $data["url"] . "]" . $data["title"] . "[/url]";
                 } elseif (($data["url"] != "")) {
-                       $text .= "\n" . $data["url"];
+                       $text .= "\n[url]" . $data["url"] . "[/url]";
                 }
  
                 return $text . "\n" . $data["after"];
@@ -770,7 +637,7 @@ class BBCode extends BaseObject
          * @param array $match Array with the matching values
          * @return string reformatted link including HTML codes
          */
-       private static function convertUrlForMastodonCallback($match)
+       private static function convertUrlForOStatusCallback($match)
         {
                 $url = $match[1];
  
@@ -783,34 +650,27 @@ class BBCode extends BaseObject
                         return $match[0];
                 }
  
-               return self::convertUrlForMastodon($url);
+               return self::convertUrlForOStatus($url);
         }
  
         /**
-        * @brief Converts [url] BBCodes in a format that looks fine on Mastodon and GNU Social.
+        * @brief Converts [url] BBCodes into a format that looks fine on OStatus systems.
          * @param string $url URL that is about to be reformatted
          * @return string reformatted link including HTML codes
          */
-       private static function convertUrlForMastodon($url)
+       private static function convertUrlForOStatus($url)
         {
                 $parts = parse_url($url);
                 $scheme = $parts['scheme'] . '://';
                 $styled_url = str_replace($scheme, '', $url);
  
-               $html = '<a href="%s" class="attachment" rel="nofollow noopener" target="_blank">' .
-                       '<span class="invisible">%s</span>';
-
                 if (strlen($styled_url) > 30) {
-                       $html .= '<span class="ellipsis">%s</span>' .
-                               '<span class="invisible">%s</span></a>';
-
-                       $ellipsis = substr($styled_url, 0, 30);
-                       $rest = substr($styled_url, 30);
-                       return sprintf($html, $url, $scheme, $ellipsis, $rest);
-               } else {
-                       $html .= '%s</a>';
-                       return sprintf($html, $url, $scheme, $styled_url);
+                       $styled_url = substr($styled_url, 0, 30) . "…";
                 }
+
+               $html = '<a href="%s" target="_blank">%s</a>';
+
+               return sprintf($html, $url, $styled_url);
         }
  
         /*
@@ -1105,13 +965,13 @@ class BBCode extends BaseObject
                                 }
  
                                 if (stripos(normalise_link($link), 'http://twitter.com/') === 0) {
+                                       $text .= '<br /><a href="' . $link . '">' . $link . '</a>';
+                               } else {
                                         $text .= $headline . '<blockquote>' . trim($share[3]) . "</blockquote><br />";
  
                                         if ($link != "") {
                                                 $text .= '<br /><a href="' . $link . '">[l]</a>';
                                         }
-                               } else {
-                                       $text .= '<br /><a href="' . $link . '">' . $link . '</a>';
                                 }
  
                                 break;
@@ -1207,7 +1067,7 @@ class BBCode extends BaseObject
  
                                 $doc = new DOMDocument();
                                 @$doc->loadHTML($body);
-                               $xpath = new DomXPath($doc);
+                               $xpath = new DOMXPath($doc);
                                 $list = $xpath->query("//meta[@name]");
                                 foreach ($list as $node) {
                                         $attr = [];
@@ -1296,13 +1156,17 @@ class BBCode extends BaseObject
  
         private static function textHighlightCallback($match)
         {
+               // Fallback in case the language doesn't exist
+               $return = '[code]' . $match[2] . '[/code]';
+
                 if (in_array(strtolower($match[1]),
                                 ['php', 'css', 'mysql', 'sql', 'abap', 'diff', 'html', 'perl', 'ruby',
-                               'vbscript', 'avrc', 'dtd', 'java', 'xml', 'cpp', 'python', 'javascript', 'js', 'sh'])
+                               'vbscript', 'avrc', 'dtd', 'java', 'xml', 'cpp', 'python', 'javascript', 'js', 'sh', 'bash'])
                 ) {
-                       return text_highlight($match[2], strtolower($match[1]));
+                       $return = text_highlight($match[2], strtolower($match[1]));
                 }
-               return $match[0];
+
+               return $return;
         }
  
         /**
@@ -1379,7 +1243,7 @@ class BBCode extends BaseObject
                 // After we're finished processing the bbcode we'll
                 // replace all of the event code with a reformatted version.
  
-               $ev = bbtoevent($text);
+               $ev = Event::fromBBCode($text);
  
                 // Replace any html brackets with HTML Entities to prevent executing HTML or script
                 // Don't use strip_tags here because it breaks [url] search by replacing & with amp
@@ -1439,8 +1303,8 @@ class BBCode extends BaseObject
                         $autolink_regex = "/([^\]\='".'"'."]|^)(https?\:\/\/[a-zA-Z0-9\:\/\-\?\&\;\.\=\_\~\#\%\$\!\+\,]+)/ism";
                         $text = preg_replace($autolink_regex, '$1[url]$2[/url]', $text);
                         if ($simple_html == 7) {
-                               $text = preg_replace_callback("/\[url\]([$URLSearchString]*)\[\/url\]/ism", 'self::convertUrlForMastodonCallback', $text);
-                               $text = preg_replace_callback("/\[url\=([$URLSearchString]*)\]([$URLSearchString]*)\[\/url\]/ism", 'self::convertUrlForMastodonCallback', $text);
+                               $text = preg_replace_callback("/\[url\]([$URLSearchString]*)\[\/url\]/ism", 'self::convertUrlForOStatusCallback', $text);
+                               $text = preg_replace_callback("/\[url\=([$URLSearchString]*)\]([$URLSearchString]*)\[\/url\]/ism", 'self::convertUrlForOStatusCallback', $text);
                         }
                 } else {
                         $text = preg_replace("(\[url\]([$URLSearchString]*)\[\/url\])ism", " $1 ", $text);
@@ -1540,10 +1404,8 @@ class BBCode extends BaseObject
                 if (strpos($text, '[/map]') !== false) {
                         $text = preg_replace_callback(
                                 "/\[map\](.*?)\[\/map\]/ism",
-                               function ($match) {
-                                       // the extra space in the following line is intentional
-                                       // Whyyy? - @MrPetovan
-                                       return str_replace($match[0], '<div class="map"  >' . Map::byLocation($match[1]) . '</div>', $match[0]);
+                               function ($match) use ($simple_html) {
+                                       return str_replace($match[0], '<p class="map">' . Map::byLocation($match[1], $simple_html) . '</p>', $match[0]);
                                 },
                                 $text
                         );
@@ -1551,16 +1413,14 @@ class BBCode extends BaseObject
                 if (strpos($text, '[map=') !== false) {
                         $text = preg_replace_callback(
                                 "/\[map=(.*?)\]/ism",
-                               function ($match) {
-                                       // the extra space in the following line is intentional
-                                       // Whyyy? - @MrPetovan
-                                       return str_replace($match[0], '<div class="map"  >' . Map::byCoordinates(str_replace('/', ' ', $match[1])) . '</div>', $match[0]);
+                               function ($match) use ($simple_html) {
+                                       return str_replace($match[0], '<p class="map">' . Map::byCoordinates(str_replace('/', ' ', $match[1]), $simple_html) . '</p>', $match[0]);
                                 },
                                 $text
                         );
                 }
                 if (strpos($text, '[map]') !== false) {
-                       $text = preg_replace("/\[map\]/", '<div class="map"></div>', $text);
+                       $text = preg_replace("/\[map\]/", '<p class="map"></p>', $text);
                 }
  
                 // Check for headers
@@ -1584,7 +1444,7 @@ class BBCode extends BaseObject
                 $text = preg_replace("(\[u\](.*?)\[\/u\])ism", '<u>$1</u>', $text);
  
                 // Check for strike-through text
-               $text = preg_replace("(\[s\](.*?)\[\/s\])ism", '<strike>$1</strike>', $text);
+               $text = preg_replace("(\[s\](.*?)\[\/s\])ism", '<s>$1</s>', $text);
  
                 // Check for over-line text
                 $text = preg_replace("(\[o\](.*?)\[\/o\])ism", '<span class="overline">$1</span>', $text);
@@ -1725,6 +1585,14 @@ class BBCode extends BaseObject
                 $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", '<img src="$3" style="width: $1px;" >', $text);
                 $text = preg_replace("/\[zmg\=([0-9]*)x([0-9]*)\](.*?)\[\/zmg\]/ism", '<img class="zrl" src="$3" style="width: $1px;" >', $text);
  
+               $text = preg_replace_callback("/\[img\=([$URLSearchString]*)\](.*?)\[\/img\]/ism",
+                       function ($matches) {
+                               $matches[1] = proxy_url($matches[1]);
+                               $matches[2] = htmlspecialchars($matches[2], ENT_COMPAT);
+                               return '<img src="' . $matches[1] . '" alt="' . $matches[2] . '">';
+                       },
+                       $text);
+
                 // Images
                 // [img]pathtoimage[/img]
                 $text = preg_replace_callback(
@@ -1821,7 +1689,7 @@ class BBCode extends BaseObject
                 // start which is always required). Allow desc with a missing summary for compatibility.
  
                 if ((x($ev, 'desc') || x($ev, 'summary')) && x($ev, 'start')) {
-                       $sub = format_event_html($ev, $simple_html);
+                       $sub = Event::getHTML($ev, $simple_html);
  
                         $text = preg_replace("/\[event\-summary\](.*?)\[\/event\-summary\]/ism", '', $text);
                         $text = preg_replace("/\[event\-description\](.*?)\[\/event\-description\]/ism", '', $text);
@@ -1856,10 +1724,12 @@ class BBCode extends BaseObject
                 $text = preg_replace_callback("/\[nobb\](.*?)\[\/nobb\]/ism", 'self::unescapeNoparseCallback', $text);
                 $text = preg_replace_callback("/\[pre\](.*?)\[\/pre\]/ism", 'self::unescapeNoparseCallback', $text);
  
-
+               /// @todo What is the meaning of these lines?
                 $text = preg_replace('/\[\&amp\;([#a-z0-9]+)\;\]/', '&$1;', $text);
                 $text = preg_replace('/\&\#039\;/', '\'', $text);
-               $text = preg_replace('/\&quot\;/', '"', $text);
+
+               // Currently deactivated because it caused problems with " inside of alt texts.
+               //$text = preg_replace('/\&quot\;/', '"', $text);
  
                 // fix any escaped ampersands that may have been converted into links
                 $text = preg_replace('/\<([^>]*?)(src|href)=(.*?)\&amp\;(.*?)\>/ism', '<$1$2=$3&$4>', $text);