]> git.mxchange.org Git - friendica.git/commitdiff
Use ISO-639-1 for the language detection
authorMichael <heluecht@pirati.ca>
Thu, 2 Nov 2023 22:49:25 +0000 (22:49 +0000)
committerMichael <heluecht@pirati.ca>
Thu, 2 Nov 2023 22:49:25 +0000 (22:49 +0000)
src/Content/Conversation/Factory/Channel.php
src/Core/L10n.php
src/Model/Item.php
src/Model/Post/Engagement.php
src/Model/User.php
src/Module/Conversation/Timeline.php
src/Module/Settings/Display.php

index f03b5892519e6898f2bb7d5f1c7a3c1d32730297..0e44c7e590911da9ed5189599847e45fd822a53a 100644 (file)
@@ -35,13 +35,13 @@ final class Channel extends Timeline
         */
        public function getTimelines(int $uid): Timelines
        {
-               $language  = User::getLanguageCode($uid);
-               $languages = $this->l10n->getAvailableLanguages(true);
+               $iso639 = new \Matriphe\ISO639\ISO639;
+               $native = $iso639->nativeByCode1(User::getLanguageCode($uid));
 
                $tabs = [
                        new ChannelEntity(ChannelEntity::FORYOU, $this->l10n->t('For you'), $this->l10n->t('Posts from contacts you interact with and who interact with you'), 'y'),
                        new ChannelEntity(ChannelEntity::WHATSHOT, $this->l10n->t('What\'s Hot'), $this->l10n->t('Posts with a lot of interactions'), 'h'),
-                       new ChannelEntity(ChannelEntity::LANGUAGE, $languages[$language], $this->l10n->t('Posts in %s', $languages[$language]), 'g'),
+                       new ChannelEntity(ChannelEntity::LANGUAGE, $native, $this->l10n->t('Posts in %s', $native), 'g'),
                        new ChannelEntity(ChannelEntity::FOLLOWERS, $this->l10n->t('Followers'), $this->l10n->t('Posts from your followers that you don\'t follow'), 'f'),
                        new ChannelEntity(ChannelEntity::SHARERSOFSHARERS, $this->l10n->t('Sharers of sharers'), $this->l10n->t('Posts from accounts that are followed by accounts that you follow'), 'r'),
                        new ChannelEntity(ChannelEntity::IMAGE, $this->l10n->t('Images'), $this->l10n->t('Posts with images'), 'i'),
index 548aea1acef48844fe3a89adc3982fb1a43c04b5..87d11de6fba466a4078f1ff319b00f805fff5226 100644 (file)
@@ -378,7 +378,7 @@ class L10n
         *
         * @return array
         */
-       public function getAvailableLanguages(bool $additional = false): array
+       public function getAvailableLanguages(): array
        {
                $langs              = [];
                $strings_file_paths = glob('view/lang/*/strings.php');
@@ -392,107 +392,94 @@ class L10n
                                $path_array            = explode('/', $strings_file_path);
                                $langs[$path_array[2]] = self::LANG_NAMES[$path_array[2]] ?? $path_array[2];
                        }
+               }
+               return $langs;
+       }
 
-                       if ($additional) {
-                               // See https://github.com/friendica/friendica/issues/10511
-                               // Persian is manually added to language detection until a persian translation is provided for the interface, at
-                               // which point it will be automatically available through `getAvailableLanguages()` and this should be removed.
-                               // Additionally some more languages are added to that list that are used in the Fediverse.
-                               $additional_langs = [
-                                       'af'         => 'Afrikaans',
-                                       'az-Latn'    => 'azərbaycan dili',
-                                       'bs-Latn'    => 'bosanski jezik',
-                                       'be'         => 'беларуская мова',
-                                       'bn'         => 'বাংলা',
-                                       'cy'         => 'Cymraeg',
-                                       'el-monoton' => 'ελληνικά',
-                                       'eu'         => 'euskara, euskera',
-                                       'fa'         => 'فارسی',
-                                       'ga'         => 'Gaeilge',
-                                       'gl'         => 'galego',
-                                       'he'         => 'עברית',
-                                       'hi'         => 'हिन्दी, हिंदी',
-                                       'hr'         => 'hrvatski jezik',
-                                       'hy'         => 'Հայերեն',
-                                       'id'         => 'Bahasa Indonesia',
-                                       'jv'         => 'basa Jawa',
-                                       'ka'         => 'ქართული',
-                                       'ko'         => '한국어, 조선어',
-                                       'lt'         => 'lietuvių kalba',
-                                       'lv'         => 'latviešu valoda',
-                                       'ms-Latn'    => 'bahasa Melayu, بهاس ملايو‎',
-                                       'sr-Cyrl'    => 'српски језик',
-                                       'sk'         => 'slovenčina, slovenský jazyk',
-                                       'sl'         => 'slovenski jezik, slovenščina',
-                                       'sq'         => 'Shqip',
-                                       'sw'         => 'Kiswahili',
-                                       'ta'         => 'தமிழ்',
-                                       'th'         => 'ไทย',
-                                       'tl'         => 'Wikang Tagalog, ᜏᜒᜃᜅ᜔ ᜆᜄᜎᜓᜄ᜔',
-                                       'tr'         => 'Türkçe',
-                                       'pt-PT'      => 'português',
-                                       'uk'         => 'українська мова',
-                                       'uz'         => 'Oʻzbek, Ўзбек, أۇزبېك‎',
-                                       'vi'         => 'Việt Nam',
-                                       'zh-hant'    => '繁體',
-                               ];
-                               $langs = array_merge($additional_langs, $langs);
-                               ksort($langs);
-                       }
+       /**
+        * Get language codes that are detectable by our language detection routines.
+        * Languages that aren't used often and that tend to cause false detections are excluded.
+        * The listed codes are a collection of both the official ISO 639-1 codes and
+        * the codes that are used by our built-in language detection routine.
+        * When the detection is done, the result only consists of the official ISO 639-1 codes.
+        *
+        * @return array
+        */
+       public function getDetectableLanguages(): array
+       {
+               $additional_langs = [
+                       'af', 'az', 'az-Cyrl', 'az-Latn', 'be', 'bn', 'bs', 'bs-Cyrl', 'bs-Latn',
+                       'cy', 'da', 'el', 'el-monoton', 'el-polyton', 'en', 'eu',
+                       'fa', 'fi', 'ga', 'gl', 'he', 'hi', 'hr', 'hy', 'id', 'in', 'iw', 'jv', 'jw',
+                       'ka', 'ko', 'lt', 'lv', 'mo', 'ms', 'ms-Arab', 'ms-Latn', 'nb', 'nn', 'no',
+                       'pt', 'pt-PT', 'pt-BR', 'ro', 'sa', 'sk', 'sl', 'sq', 'sr', 'sr-Cyrl', 'sr-Latn', 'sw',
+                       'ta', 'th', 'tl', 'tr', 'ug', 'uk', 'uz', 'vi', 'zh', 'zh-Hant', 'zh-Hans',
+               ];
+
+               if (in_array('cld2', get_loaded_extensions())) {
+                       $additional_langs = array_merge($additional_langs,
+                               ['sd', 'si', 'yi', 'km', 'iu', 'lo', 'dv', 'gu', 'kn', 'te', 'ml', 'or', 'pa', 'iu']);
                }
+
+               $langs = array_merge($additional_langs, array_keys($this->getAvailableLanguages()));
+               sort($langs);
                return $langs;
        }
 
        /**
-        * The language detection routine uses some slightly different language codes.
-        * This function changes the language array accordingly.
+        * Return a list of supported languages with their two-letter language codes.
         *
-        * @param array $languages
+        * @param bool $international If set to true, the international language name is appended to the native name when the two differ.
         * @return array
         */
-       public function convertForLanguageDetection(array $languages): array
+       public function getLanguageCodes(bool $international = false): array
        {
-               foreach ($languages as $key => $language) {
-                       $newkey = $this->convertCodeForLanguageDetection($key);
-                       if ($newkey != $key) {
-                               if (!isset($languages[$newkey])) {
-                                       $languages[$newkey] = $language;
-                               }
-                               unset($languages[$key]);
+               $iso639 = new \Matriphe\ISO639\ISO639;
+
+               $languages = [];
+
+               foreach ($this->getDetectableLanguages() as $code) {
+                       $code     = $this->toISO6391($code);
+                       $native   = $iso639->nativeByCode1($code);
+                       $language = $iso639->languageByCode1($code);
+                       if ($native != $iso639->languageByCode1($code) && $international) {
+                               $languages[$code] = $this->t('%s (%s)', $native, $language);
+                       } else {
+                               $languages[$code] = $native;
                        }
                }
 
-               ksort($languages);
-
                return $languages;
        }
 
        /**
-        * The language detection routine uses some slightly different language codes.
-        * This function changes the language codes accordingly.
+        * Convert the language code to ISO639-1
+        * It also converts old codes to their new counterparts.
         *
-        * @param string $language
+        * @param string $code
         * @return string
         */
-       public function convertCodeForLanguageDetection(string $language): string
+       public function toISO6391(string $code): string
        {
-               switch ($language) {
-                       case 'da-dk':
-                               return 'da';
-                       case 'en-us':
-                       case 'en-gb':
-                               return 'en';
-                       case 'fi-fi':
-                               return 'fi';
-                       case 'nb-no':
-                               return 'nb';
-                       case 'pt-br':
-                               return 'pt-BR';
-                       case 'zh-cn':
-                               return 'zh-Hans';
-                       default:
-                               return $language;
+               if ((strlen($code) > 2) && (substr($code, 2, 1) == '-')) {
+                       $code = substr($code, 0, 2);
+               }
+               if (in_array($code, ['nb', 'nn'])) {
+                       $code = 'no';
+               }
+               if ($code == 'in') {
+                       $code = 'id';
+               }
+               if ($code == 'iw') {
+                       $code = 'he';
+               }
+               if ($code == 'jw') {
+                       $code = 'jv';
+               }
+               if ($code == 'mo') {
+                       $code = 'ro';
                }
+               return $code;
        }
 
        /**
index 811eab50b41e17532ab9aaa48019a84f3d38ed2f..e505e76d97de699e170cbb7013e995da8361d99a 100644 (file)
@@ -2034,15 +2034,12 @@ class Item
                        return [];
                }
 
-               $availableLanguages = DI::l10n()->getAvailableLanguages(true);
-               $availableLanguages = DI::l10n()->convertForLanguageDetection($availableLanguages);
-
-               $ld = new Language(array_keys($availableLanguages));
+               $ld = new Language(DI::l10n()->getDetectableLanguages());
 
                $result = [];
 
                foreach (self::splitByBlocks($searchtext) as $block) {
-                       $languages = $ld->detect($block)->limit(0, $count)->close() ?: [];
+                       $languages = $ld->detect($block)->close() ?: [];
 
                        $data = [
                                'text'      => $block,
@@ -2057,10 +2054,32 @@ class Item
                        }
                }
 
+               $result = self::compactLanguages($result);
+
                arsort($result);
-               $result = array_slice($result, 0, $count);
+               return array_slice($result, 0, $count);
+       }
 
-               return $result;
+       /**
+        * Convert the language codes in the detection result to ISO 639-1.
+        * On duplicates the system uses the higher quality value.
+        *
+        * @param array $result
+        * @return array
+        */
+       private static function compactLanguages(array $result): array
+       {
+               $languages = [];
+               foreach ($result as $language => $quality) {
+                       if ($quality == 0) {
+                               continue;
+                       }
+                       $code = DI::l10n()->toISO6391($language);
+                       if (empty($languages[$code]) || ($languages[$code] < $quality)) {
+                               $languages[$code] = $quality;
+                       }
+               }
+               return $languages;
        }
 
        /**
index 017c34d19c9c2309b235b309dab16243f0e1ef83..74b479653cb24543b1db2e385540437133f21f87 100644 (file)
@@ -35,9 +35,6 @@ use Friendica\Model\Verb;
 use Friendica\Protocol\Activity;
 use Friendica\Protocol\Relay;
 use Friendica\Util\DateTimeFormat;
-use Friendica\Util\Strings;
-
-// Channel
 
 class Engagement
 {
index 698172c1df7f27ce88dc577b696cade35f8cab85..24a75b5d36e72502bcb8e4e87768359f70fb3c4e 100644 (file)
@@ -127,7 +127,6 @@ class User
 
                        case 'community':
                                return User::ACCOUNT_TYPE_COMMUNITY;
-
                }
                return null;
        }
@@ -425,7 +424,7 @@ class User
         * @return array user
         * @throws Exception
         */
-       public static function getFirstAdmin(array $fields = []) : array
+       public static function getFirstAdmin(array $fields = []): array
        {
                if (!empty(DI::config()->get('config', 'admin_nickname'))) {
                        return self::getByNickname(DI::config()->get('config', 'admin_nickname'), $fields);
@@ -560,22 +559,20 @@ class User
                return $default_circle;
        }
 
-/**
- * Fetch the language code from the given user. If the code is invalid, return the system language
- *
- * @param integer $uid User-Id
- * @return string
- */
+       /**
       * Fetch the language code from the given user. If the code is invalid, return the system language
       *
       * @param integer $uid User-Id
       * @return string
       */
        public static function getLanguageCode(int $uid): string
        {
-               $owner = self::getOwnerDataById($uid);
-               $languages = DI::l10n()->getAvailableLanguages(true);
-               if (in_array($owner['language'], array_keys($languages))) {
-                       $language = $owner['language'];
-               } else {
-                       $language = DI::config()->get('system', 'language');
+               $owner    = self::getOwnerDataById($uid);
+               $language = DI::l10n()->toISO6391($owner['language']);
+               if (in_array($language, array_keys(DI::l10n()->getLanguageCodes()))) {
+                       return $language;
                }
-               return $language;
+               return DI::l10n()->toISO6391(DI::config()->get('system', 'language'));
        }
 
        /**
@@ -1480,7 +1477,7 @@ class User
                Photo::delete(['uid' => $register['uid']]);
 
                return DBA::delete('user', ['uid' => $register['uid']]) &&
-                      Register::deleteByHash($register['hash']);
+                       Register::deleteByHash($register['hash']);
        }
 
        /**
index d15fefe3e06aa111bea25b39ec9bf4852af9bfde..725634eb0920ffd68e5ab27ba6962aa8979af1ca 100644 (file)
@@ -304,7 +304,7 @@ class Timeline extends BaseModule
                } elseif ($this->selectedTab == ChannelEntity::AUDIO) {
                        $condition = ["`media-type` & ?", 4];
                } elseif ($this->selectedTab == ChannelEntity::LANGUAGE) {
-                       $condition = ["JSON_EXTRACT(JSON_KEYS(language), '$[0]') = ?", $this->l10n->convertCodeForLanguageDetection(User::getLanguageCode($uid))];
+                       $condition = ["JSON_EXTRACT(JSON_KEYS(language), '$[0]') = ?", User::getLanguageCode($uid)];
                } elseif (is_numeric($this->selectedTab)) {
                        $condition = $this->getUserChannelConditions($this->selectedTab, $this->session->getLocalUserId());
                }
@@ -421,7 +421,6 @@ class Timeline extends BaseModule
        {
                $conditions = [];
                $languages  = $this->pConfig->get($uid, 'channel', 'languages', [User::getLanguageCode($uid)]);
-               $languages  = $this->l10n->convertForLanguageDetection($languages);
                foreach ($languages as $language) {
                        $conditions[] = "JSON_EXTRACT(JSON_KEYS(language), '$[0]') = ?";
                        $condition[]  = $language;
index b5dbf01eb810161a3e9022c8eca2cdaf369a22be..ad36da2ddfa65106af3733815c0dac10bd2eb9a3 100644 (file)
@@ -260,7 +260,7 @@ class Display extends BaseSettings
                $bookmarked_timelines = $this->pConfig->get($uid, 'system', 'network_timelines', $this->getAvailableTimelines($uid, true)->column('code'));
                $enabled_timelines    = $this->pConfig->get($uid, 'system', 'enabled_timelines', $this->getAvailableTimelines($uid, false)->column('code'));
                $channel_languages = $this->pConfig->get($uid, 'channel', 'languages', [User::getLanguageCode($uid)]);
-               $languages         = $this->l10n->getAvailableLanguages(true);
+               $languages         = $this->l10n->getLanguageCodes(true);
 
                $timelines = [];
                foreach ($this->getAvailableTimelines($uid) as $timeline) {