]> git.mxchange.org Git - friendica-addons.git/blob - cld/cld.php
CLD: New plugin for language detection via CLD2
[friendica-addons.git] / cld / cld.php
1 <?php
2 /**
3  * Name: Compact Language Detector
4  * Description: Improved language detection
5  * Version: 0.1
6  * Author: Michael Vogel <heluecht@pirati.ca>
7  */
8
9 use Friendica\Core\Hook;
10 use Friendica\Core\Logger;
11 use Friendica\DI;
12
/**
 * Registers this addon's language detection callback.
 *
 * Binds the function `cld_get_language` from `addon/cld/cld.php` to the
 * `get_language` hook. Presumably invoked by Friendica when the addon is
 * installed (inferred from the function name - confirm against the addon loader).
 */
function cld_install()
{
	$hookName = 'get_language';
	$addonFile = 'addon/cld/cld.php';
	$callback = 'cld_get_language';

	Hook::register($hookName, $addonFile, $callback);
}
17
/**
 * Hook callback for `get_language`: detects the language of a text via the CLD2 extension.
 *
 * Reads `$data['text']` and, when CLD2 reports a reliable result whose (normalised)
 * language code is among the languages reported by `DI::l10n()`, replaces
 * `$data['detected']` with a single-entry array `[language code => probability / 100]`.
 * In every other case `$data` is left untouched.
 *
 * @param array $data Hook payload, passed by reference. Keys used here:
 *                    - 'text':     the text to analyse; a missing, non-string or empty
 *                                  value makes this function a no-op.
 *                    - 'detected': previous detection result; its first key is only
 *                                  used for logging, and the whole entry is overwritten
 *                                  on success.
 */
function cld_get_language(array &$data)
{
	// Nothing to analyse: bail out before touching the extension or the logger.
	$text = $data['text'] ?? '';
	if (!is_string($text) || $text === '') {
		return;
	}

	if (!extension_loaded('cld2')) {
		Logger::warning('CLD2 is not installed.');
		return;
	}

	$cld2 = new \CLD2Detector();
	$cld2->setEncodingHint(\CLD2Encoding::UTF8); // optional, hints about text encoding

	$result = $cld2->detect($text);

	// Language that was detected before this addon ran (first key), kept for logging only.
	$original = '';
	if (!empty($data['detected']) && is_array($data['detected'])) {
		$original = array_key_first($data['detected']);
	}

	// Translate CLD2 language codes into the codes used for the availability check below.
	$codeMap = [
		'pt'      => 'pt-PT',
		'el'      => 'el-monoton',
		'no'      => 'nb',
		'zh'      => 'zh-Hans',
		'zh-Hant' => 'zh-hant',
	];
	$detected = $codeMap[$result['language_code']] ?? $result['language_code'];

	// Shared context for all debug messages below.
	$context = [
		'original'    => $original,
		'detected'    => $detected,
		'name'        => $result['language_name'],
		'probability' => $result['language_probability'],
		'text'        => $text,
	];

	if (!$result['is_reliable']) {
		Logger::debug('Unreliable detection', $context);
		return;
	}

	// For testing purposes only: these codes are silently ignored.
	// NOTE(review): the original comment marks this list as temporary - confirm whether it should stay.
	if (in_array($detected, ['xx-Qaai', 'ht', 'ga'], true)) {
		return;
	}

	$l10n = DI::l10n();
	$available = array_keys($l10n->convertForLanguageDetection($l10n->getAvailableLanguages(true)));

	if (!in_array($detected, $available, true)) {
		Logger::debug('Unsupported language', $context);
		return;
	}

	Logger::debug('Detected', $context);

	// language_probability is divided by 100 here; presumably CLD2 reports a percentage - confirm.
	$data['detected'] = [$detected => $result['language_probability'] / 100];
}