The language detection is now done in blocks

author Michael <heluecht@pirati.ca>

Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)

committer Michael <heluecht@pirati.ca>

Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)
author Michael <heluecht@pirati.ca>
Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)
committer Michael <heluecht@pirati.ca>
Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)
diff --git a/doc/Addons.md b/doc/Addons.md

index bfccde5ddf8908da796480199ba8a2151c1ec23c..b89a48d26d328ebf017808356f7335621b7f0de5 100644 (file)
--- a/doc/Addons.md
+++ b/doc/Addons.md
@@ -228,6 +228,7 @@ Called after the language detection. This can be used for alternative language d
  - **text**: The text that is analyzed.
  - **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code, the array value the probability.
  - **uri-id**: The Uri-Id of the item.
+- **author-id**: The id of the author contact.
  
  ### addon_settings
  Called when generating the HTML for the addon settings page.
diff --git a/doc/de/Addons.md b/doc/de/Addons.md

index c61b68b489e201e5a4467cb0d497ed74f66b5ad8..0843c103abd2ee1e00d7372f076396b4b90de7fb 100644 (file)
--- a/doc/de/Addons.md
+++ b/doc/de/Addons.md
@@ -110,6 +110,7 @@ Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzu
          'text' => Der analysierte Text.
          'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit.
          'uri-id' => Die Uri-Id des Beitrags
+        'author-id' => Die Contact-id des Autors.
  
  **'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
      $b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
diff --git a/src/Model/Item.php b/src/Model/Item.php

index d3ae8aa6c661e231814a68c8ebeecb0bd97c9777..55884a802eeef1099c096434a9dff21f17ae74d0 100644 (file)
--- a/src/Model/Item.php
+++ b/src/Model/Item.php
@@ -49,6 +49,7 @@ use Friendica\Util\Proxy;
  use Friendica\Util\Strings;
  use Friendica\Util\Temporal;
  use GuzzleHttp\Psr7\Uri;
+use IntlChar;
  use LanguageDetection\Language;
  
  class Item
@@ -2010,67 +2011,118 @@ class Item
          */
         public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array
         {
-               $naked_body = BBCode::toSearchText($body, $uri_id);
+               $searchtext = BBCode::toSearchText($body, $uri_id);
  
-               if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) {
+               if ((count(explode(' ', $searchtext)) < 10) && (mb_strlen($searchtext) < 30) && $author_id) {
                         $author = Contact::selectFirst(['about'], ['id' => $author_id]);
                         if (!empty($author['about'])) {
                                 $about = BBCode::toSearchText($author['about'], 0);
-                               $about = self::getDominantLanguage($about);
-                               Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]);
-                               $naked_body .= ' ' . $about;
+                               Logger::debug('About field added', ['author' => $author_id, 'body' => $searchtext, 'about' => $about]);
+                               $searchtext .= ' ' . $about;
                         }
                 }
  
-               if (empty($naked_body)) {
+               if (empty($searchtext)) {
                         return [];
                 }
  
-               $naked_body = self::getDominantLanguage($naked_body);
-
                 $availableLanguages = DI::l10n()->getAvailableLanguages(true);
                 $availableLanguages = DI::l10n()->convertForLanguageDetection($availableLanguages);
  
                 $ld = new Language(array_keys($availableLanguages));
-               $languages = $ld->detect($naked_body)->limit(0, $count)->close() ?: [];
  
-               $data = [
-                       'text'     => $naked_body,
-                       'detected' => $languages,
-                       'uri-id'   => $uri_id,
-               ];
+               $result = [];
+
+               foreach (self::splitByBlocks($searchtext) as $block) {
+                       $languages = $ld->detect($block)->limit(0, $count)->close() ?: [];
+
+                       $data = [
+                               'text'      => $block,
+                               'detected'  => $languages,
+                               'uri-id'    => $uri_id,
+                               'author-id' => $author_id,
+                       ];
+                       Hook::callAll('detect_languages', $data);
+
+                       foreach ($data['detected'] as $language => $quality) {
+                               $result[$language] = max($result[$language] ?? 0, $quality * (strlen($block) / strlen($searchtext)));
+                       }
+               }
  
-               Hook::callAll('detect_languages', $data);
-               $languages = $data['detected'];
+               arsort($result);
+               $result = array_slice($result, 0, $count);
  
-               return $languages;
+               return $result;
         }
  
         /**
-        * Check if latin or non latin are dominant in the body and only return the dominant one
+        * Split a string into different unicode blocks
+        * Currently the text is split into the latin and the non latin part.
          *
          * @param string $body
-        * @return string
+        * @return array
          */
-       private static function getDominantLanguage(string $body): string
+       private static function splitByBlocks(string $body): array
         {
-               $latin = '';
-               $non_latin = '';
+               $blocks         = [];
+               $previous_block = 0;
+
                 for ($i = 0; $i < mb_strlen($body); $i++) {
                         $character = mb_substr($body, $i, 1);
-                       $ord = mb_ord($character);
-
-                       // We add the most common characters to both strings.
-                       if (($ord <= 64) || ($ord >= 91 && $ord <= 96) || ($ord >= 123 && $ord <= 191) || in_array($ord, [215, 247]) || ($ord >= 697 && $ord <= 735) || ($ord > 65535)) {
-                               $latin .= $character;
-                               $non_latin .= $character;
-                       } elseif ($ord < 768) {
-                               $latin .= $character;
+                       $previous  = ($i > 0) ? mb_substr($body, $i - 1, 1) : '';
+                       $next      = ($i < mb_strlen($body)) ? mb_substr($body, $i + 1, 1) : '';
+
+                       if (!IntlChar::isalpha($character)) {
+                               if (($previous != '') && (IntlChar::isalpha($previous))) {
+                                       $previous_block = self::getBlockCode($previous);
+                               }
+
+                               $block = (($next != '') && IntlChar::isalpha($next)) ? self::getBlockCode($next) : $previous_block;
+                               $blocks[$block] = ($blocks[$block] ?? '') . $character;
                         } else {
-                               $non_latin .= $character;
+                               $block = self::getBlockCode($character);
+                               $blocks[$block] = ($blocks[$block] ?? '') . $character;
                         }
                 }
-               return (mb_strlen($latin) > mb_strlen($non_latin)) ? $latin : $non_latin;
+
+               foreach (array_keys($blocks) as $key) {
+                       $blocks[$key] = trim($blocks[$key]);
+                       if (empty($blocks[$key])) {
+                               unset($blocks[$key]);
+                       }
+               }
+
+               return array_values($blocks);
+       }
+
+       /**
+        * Maps a single character to the code of the text block it is assigned to
+        *
+        * @param string $character
+        * @return int 0 = no alpha character (blank, signs, emojis, ...), 1 = latin character, 2 = alpha character that is not latin
+        */
+       private static function getBlockCode(string $character): int
+       {
+               if (IntlChar::isalpha($character)) {
+                       return self::isLatin($character) ? 1 : 2;
+               }
+               return 0;
+       }
+
+       /**
+        * Checks if the Unicode block of the given character is one of the latin blocks listed below
+        *
+        * @param string $character
+        * @return bool
+        */
+       private static function isLatin(string $character): bool
+       {
+               return in_array(IntlChar::getBlockCode($character), [
+                       IntlChar::BLOCK_CODE_BASIC_LATIN, IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT,
+                       IntlChar::BLOCK_CODE_LATIN_EXTENDED_A, IntlChar::BLOCK_CODE_LATIN_EXTENDED_B,
+                       IntlChar::BLOCK_CODE_LATIN_EXTENDED_C, IntlChar::BLOCK_CODE_LATIN_EXTENDED_D,
+                       IntlChar::BLOCK_CODE_LATIN_EXTENDED_E, IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL,
+               ], true);
         }
  
         public static function getLanguageMessage(array $item): string
author	Michael <heluecht@pirati.ca>
	Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)
committer	Michael <heluecht@pirati.ca>
	Wed, 11 Oct 2023 18:38:14 +0000 (18:38 +0000)
doc/Addons.md		patch \| blob \| history
doc/de/Addons.md		patch \| blob \| history
src/Model/Item.php		patch \| blob \| history