git.mxchange.org Git - friendica.git/commitdiff
More languages / use profile text as fallback
author Michael <heluecht@pirati.ca>
Mon, 2 Oct 2023 20:37:16 +0000 (20:37 +0000)
committer Michael <heluecht@pirati.ca>
Mon, 2 Oct 2023 20:37:16 +0000 (20:37 +0000)
doc/Addons.md
doc/de/Addons.md
src/Content/Text/BBCode.php
src/Core/L10n.php
src/Model/Item.php

index a0b12267966f4a401e456ece9ff6b61e14702bf4..8ad1f89ebef0f9640be56a9292057c35ffc41aeb 100644 (file)
@@ -227,6 +227,7 @@ Called after the language detection. This can be used for alternative language d
 
 - **text**: The text that is analyzed.
 - **detected**: (input/output) Array of language codes detected in the related text.
+- **uri-id**: The Uri-Id of the item.
 
 ### addon_settings
 Called when generating the HTML for the addon settings page.
index bd13f6334c8461c792d8ca6200d4bf83d03a7440..99639e2bddb34c0337a15cf724f38cee41d8f8cc 100644 (file)
@@ -109,6 +109,7 @@ Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzu
 `$data` ist ein Array:
         'text' => Der analysierte Text.
         'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen.
+        'uri-id' => Die Uri-Id des Beitrags.
 
 **'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
     $b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
index ab7300da18b5c3e4f35b4bed9745374aee45bde5..36355f99645ff2936fb446733960571ba0ae3230 100644 (file)
@@ -230,18 +230,73 @@ class BBCode
        {
                DI::profiler()->startRecording('rendering');
                // Remove pictures in advance to avoid unneeded proxy calls
+               $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text);
                $text = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $2 ', $text);
                $text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);
 
                // Remove attachment
                $text = self::replaceAttachment($text);
 
-               $naked_text = HTML::toPlaintext(self::convert($text, false, BBCode::EXTERNAL, true), 0, !$keep_urls);
+               $naked_text = HTML::toPlaintext(self::convert($text, false, self::EXTERNAL, true), 0, !$keep_urls);
 
                DI::profiler()->stopRecording();
                return $naked_text;
        }
 
+       /**
+        * Converts BBCode text into plain text that can be used for the channel search and the language detection.
+        *
+        * @param string  $text   BBCode text to convert
+        * @param integer $uri_id Uri-Id of the item; when not empty, its image attachments are added to the text first
+        * @return string Plain text without attachments, links, pictures and mentions, with white space collapsed
+        */
+       public static function toSearchText(string $text, int $uri_id): string
+       {
+               // Removes attachments
+               $text = self::removeAttachment($text);
+
+               // Add images because of possible alt texts
+               if (!empty($uri_id)) {
+                       $text = Post\Media::addAttachmentsToBody($uri_id, $text, [Post\Media::IMAGE]);
+               }
+
+               if (empty($text)) {
+                       return '';
+               }
+
+               // Remove links without a link description; both parts are bounded so one match never spans several links
+               $text = preg_replace("~\[url\=[^\[\]]*\]https?:.*?\[\/url\]~", ' ', $text);
+
+               // Remove pictures that are given with a size ("[img=WIDTHxHEIGHT]...[/img]")
+               $text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text);
+
+               // Replace picture with the alt description
+               $text = preg_replace("/\[img\=.*?\](.*?)\[\/img\]/ism", ' $1 ', $text);
+
+               // Remove the other pictures
+               $text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);
+
+               // Removes mentions, remove links from hashtags (keeping the tag name), then drop the remaining url elements
+               $text = preg_replace('/[@!]\[url\=.*?\].*?\[\/url\]/ism', ' ', $text);
+               $text = preg_replace('/[#]\[url\=.*?\](.*?)\[\/url\]/ism', ' #$1 ', $text);
+               $text = preg_replace('/[@!#]?\[url.*?\[\/url\]/ism', ' ', $text);
+               $text = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $text);
+
+               // Convert it to plain text
+               $text = self::toPlaintext($text, false);
+
+               // Remove possibly remaining links
+               $text = preg_replace(Strings::autoLinkRegEx(), '', $text);
+
+               // Collapse white space: line breaks, double quotes and underscores become blanks; repeat until nothing changes
+               do {
+                       $oldtext = $text;
+                       $text = str_replace(['  ', "\n", "\r", '"', '_'], ' ', $text);
+               } while ($oldtext !== $text);
+
+               return trim($text);
+       }
+
        private static function proxyUrl(string $image, int $simplehtml = self::INTERNAL, int $uriid = 0, string $size = ''): string
        {
                // Only send proxied pictures to API and for internal display
@@ -931,7 +986,7 @@ class BBCode
                                $network = $contact['network'] ?? Protocol::PHANTOM;
 
                                $tpl = Renderer::getMarkupTemplate('shared_content.tpl');
-                               $text .= BBCode::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
+                               $text .= self::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
                                        '$profile'      => $attributes['profile'],
                                        '$avatar'       => $attributes['avatar'],
                                        '$author'       => $attributes['author'],
@@ -1112,6 +1167,7 @@ class BBCode
        public static function removeLinks(string $bbcode): string
        {
                DI::profiler()->startRecording('rendering');
+               $bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode);
                $bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode);
                $bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode);
 
@@ -1996,7 +2052,7 @@ class BBCode
        {
                DI::profiler()->startRecording('rendering');
 
-               $text = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
+               $text = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
                        $text = preg_replace("/[\s|\n]*\[abstract\].*?\[\/abstract\][\s|\n]*/ism", ' ', $text);
                        $text = preg_replace("/[\s|\n]*\[abstract=.*?\].*?\[\/abstract][\s|\n]*/ism", ' ', $text);
                        return $text;
@@ -2018,7 +2074,7 @@ class BBCode
                DI::profiler()->startRecording('rendering');
                $addon = strtolower($addon);
 
-               $abstract = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
+               $abstract = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
                        if ($addon && preg_match('#\[abstract=' . preg_quote($addon, '#') . '](.*?)\[/abstract]#ism', $text, $matches)) {
                                return $matches[1];
                        }
index 7fd7fc4e87040fb4aca31c86d851a0ab37c8888c..414d578fca8be8aa906347b7235c117b1f8b804f 100644 (file)
@@ -400,20 +400,33 @@ class L10n
                                // Additionally some more languages are added to that list that are used in the Fediverse.
                                $additional_langs = [
                                        'af'         => 'Afrikaans',
+                                       'az-Latn'    => 'azərbaycan dili',
+                                       'bs-Latn'    => 'bosanski jezik',
+                                       'be'         => 'беларуская мова',
+                                       'bn'         => 'বাংলা',
                                        'cy'         => 'Cymraeg',
                                        'el-monoton' => 'Ελληνικά',
                                        'eu'         => 'euskara',
                                        'fa'         => 'فارسی',
+                                       'ga'         => 'Gaeilge',
                                        'gl'         => 'Galego',
+                                       'he'         => 'עברית',
                                        'hi'         => 'हिन्दी',
                                        'hr'         => 'Hrvatski',
+                                       'hy'         => 'Հայերեն',
                                        'id'         => 'bahasa Indonesia',
+                                       'jv'         => 'Basa Jawa',
+                                       'ka'         => 'ქართული',
                                        'ko'         => '한국인',
                                        'lt'         => 'lietuvių',
                                        'lv'         => 'latviešu',
+                                       'ms-Latn'    => 'Bahasa Melayu',
+                                       'sr-Cyrl'    => 'српски језик',
                                        'sk'         => 'slovenský',
                                        'sl'         => 'Slovenščina',
+                                       'sq'         => 'Shqip',
                                        'sw'         => 'Kiswahili',
+                                       'ta'         => 'தமிழ்',
                                        'th'         => 'แบบไทย',
                                        'tl'         => 'Wikang Tagalog',
                                        'tr'         => 'Türkçe',
index fbb608e5c86265e0e6c986232970ee12bdfdb990..bed726704a24dbeaf9b37aac87f0cb1d5b875e8b 100644 (file)
@@ -1987,7 +1987,7 @@ class Item
                        return '';
                }
 
-               $languages = self::getLanguageArray(trim($item['title'] . "\n" . $item['body']), 3);
+               $languages = self::getLanguageArray(trim($item['title'] . "\n" . $item['body']), 3, $item['uri-id'], $item['author-id']);
                if (empty($languages)) {
                        return '';
                }
@@ -2000,24 +2000,23 @@ class Item
         *
         * @param string  $body
         * @param integer $count
+        * @param integer $uri_id
+        * @param integer $author_id
         * @return array
         */
-       public static function getLanguageArray(string $body, int $count): array
+       public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array
        {
-               // Convert attachments to links
-               $naked_body = BBCode::removeAttachment($body);
-               if (empty($naked_body)) {
-                       return [];
-               }
+               $naked_body = BBCode::toSearchText($body, $uri_id);
 
-               // Remove links and pictures
-               $naked_body = BBCode::removeLinks($naked_body);
-
-               // Convert the title and the body to plain text
-               $naked_body = BBCode::toPlaintext($naked_body);
-
-               // Remove possibly remaining links
-               $naked_body = trim(preg_replace(Strings::autoLinkRegEx(), '', $naked_body));
+               if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) {
+                       $author = Contact::selectFirst(['about'], ['id' => $author_id]);
+                       if (!empty($author['about'])) {
+                               $about = BBCode::toSearchText($author['about'], 0);
+                               $about = self::getDominantLanguage($about);
+                               Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]);
+                               $naked_body .= ' ' . $about;
+                       }
+               }
 
                if (empty($naked_body)) {
                        return [];
@@ -2034,6 +2033,7 @@ class Item
                $data = [
                        'text'     => $naked_body,
                        'detected' => $languages,
+                       'uri-id'   => $uri_id,
                ];
 
                Hook::callAll('detect_languages', $data);