git.mxchange.org Git - friendica.git/commitdiff
Improve plaintext generation for language detection
author Michael <heluecht@pirati.ca>
Mon, 5 Oct 2020 12:50:18 +0000 (12:50 +0000)
committer Michael <heluecht@pirati.ca>
Mon, 5 Oct 2020 12:50:18 +0000 (12:50 +0000)
src/Content/Text/BBCode.php
src/Model/Item.php

index 5b22746ce645492f7e28a24a805c619a484ba119..1b0fa9c7402ee33a0f908a1b39d92b8cbb78b1a4 100644 (file)
@@ -1220,6 +1220,19 @@ class BBCode
                return $return;
        }
 
+       public static function removeLinks(string $bbcode) // Replaces [img] and [url] BBCode elements in $bbcode (by a space, nothing, or one kept capture group) and returns the result.
+       {
+               $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode); // [img=<digits>x<digits>]...[/img] is replaced by a single space
+               $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode); // any other [img=...]...[/img]: keeps the attribute value ($1) padded with spaces, drops the enclosed text
+               $bbcode = preg_replace("/\[img\](.*?)\[\/img\]/ism", ' ', $bbcode); // plain [img]...[/img] is replaced by a single space
+
+               $bbcode = preg_replace('/([@!#])\[url\=(.*?)\](.*?)\[\/url\]/ism', '', $bbcode); // [url=...]...[/url] directly preceded by @, ! or # is removed together with that prefix character
+               $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); // plain [url]...[/url] is replaced by a single space
+               $bbcode = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $bbcode); // remaining [url=...]text[/url]: keeps the enclosed text ($1) padded with spaces, drops the target; the U modifier makes (.*) ungreedy
+               $bbcode = preg_replace("/\[url\](.*?)\[\/url\]/ism", ' ', $bbcode); // NOTE(review): identical to the call two lines above; looks redundant - confirm before removing
+               return $bbcode;
+       }
+
        /**
         * Converts a BBCode message to HTML message
         *
index dfea296815a94229bc07e8ad9f16a6968f468ffe..d53933ba781a52e07fb1d764e28f55b632529dcb 100644 (file)
@@ -2476,7 +2476,17 @@ class Item
                        return '';
                }
 
-               $naked_body = BBCode::toPlaintext($item['body'], false);
+               // Convert attachments to links
+               $naked_body = BBCode::removeAttachment($item['body']);
+
+               // Remove links and pictures
+               $naked_body = BBCode::removeLinks($naked_body);
+
+               // Convert the title and the body to plain text
+               $naked_body = trim($item['title'] . "\n" . BBCode::toPlaintext($naked_body));
+
+               // Remove possibly remaining links
+               $naked_body = preg_replace(Strings::autoLinkRegEx(), '', $naked_body);
 
                $ld = new Language();
                $languages = $ld->detect($naked_body)->limit(0, 3)->close();