3 * Name: Compact Language Detector
4 * Description: Improved language detection
6 * Author: Michael Vogel <heluecht@pirati.ca>
9 use Friendica\Core\Hook;
10 use Friendica\Core\Logger;
// Addon install entry point: registers cld_detect_languages(), defined in
// this file, as the handler for the 'detect_languages' hook.
13 function cld_install()
15 Hook::register('detect_languages', __FILE__, 'cld_detect_languages');
// Handler for the 'detect_languages' hook.
//
// Runs the CLD2 detector over $data['text'], rewrites some of the returned
// language codes, and - when the last statement is reached - replaces
// $data['detected'] with a single-entry array mapping the detected code to
// the reported probability divided by 100.
//
// $data is taken by reference. Keys read here: 'text', 'detected', 'uri-id'.
// Key written here: 'detected'.
//
// NOTE(review): several branch bodies and the statements that follow the
// guard conditions are not visible in this view; the comments below describe
// only the statements that are visible.
18 function cld_detect_languages(array &$data)
// The cld2 PHP extension has to be loaded; a warning is logged when it is not.
20 if (!in_array('cld2', get_loaded_extensions())) {
21 Logger::warning('CLD2 is not installed.');
// Set up the detector: UTF-8 encoding hint, input flagged as plain text.
25 $cld2 = new \CLD2Detector();
27 $cld2->setEncodingHint(CLD2Encoding::UTF8); // optional, hints about text encoding
28 $cld2->setPlainText(true);
// Fields of $result used below: 'language_code', 'language_name',
// 'language_probability', 'is_reliable'.
30 $result = $cld2->detect($data['text']);
// Remember the first key of the incoming detection result for the comparison
// and the log entries further down.
// NOTE(review): the value of $original when $data['detected'] is empty is not
// visible here - confirm it is always initialised.
32 if ($data['detected']) {
33 $original = array_key_first($data['detected']);
// Rewrite codes reported by CLD2 into other identifiers; presumably these are
// the codes the base language detection uses - verify against the core list.
// NOTE(review): the replacement values for 'pt', 'ht', 'iw', 'jw' and 'no'
// are not visible in this view.
38 $detected = $result['language_code'];
39 if ($detected == 'pt') {
41 } elseif ($detected == 'az') {
42 $detected = 'az-Latn';
43 } elseif ($detected == 'bs') {
44 $detected = 'bs-Latn';
45 } elseif ($detected == 'el') {
46 $detected = 'el-monoton';
47 } elseif ($detected == 'ht') {
49 } elseif ($detected == 'iw') {
51 } elseif ($detected == 'jw') {
53 } elseif ($detected == 'ms') {
54 $detected = 'ms-Latn';
55 } elseif ($detected == 'no') {
57 } elseif ($detected == 'sr') {
58 $detected = 'sr-Cyrl';
59 } elseif ($detected == 'zh') {
60 $detected = 'zh-Hans';
61 } elseif ($detected == 'zh-Hant') {
62 $detected = 'zh-hant';
65 // languages that aren't supported via the base language detection
// NOTE(review): the action taken for these codes is not visible - presumably
// the handler stops without touching $data; confirm.
66 if (in_array($detected, ['ceb', 'hmn', 'ht', 'kk', 'ky', 'mg', 'mk', 'ml', 'ny', 'or', 'pa', 'rw', 'su', 'st', 'tg', 'ts', 'xx-Qaai'])) {
// Results that CLD2 does not flag as reliable are written to the debug log.
70 if (!$result['is_reliable']) {
71 Logger::debug('Unreliable detection', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
// Case where CLD2 agrees with the previously detected language (the action
// taken is not visible in this view).
75 if ($original == $detected) {
// Language codes obtained from the l10n service, used as the allow-list below.
// NOTE(review): DI is referenced here, but only Hook and Logger appear in the
// visible use statements - confirm that Friendica\DI is imported.
79 $available = array_keys(DI::l10n()->convertForLanguageDetection(DI::l10n()->getAvailableLanguages(true)));
// A detected code that is not in the allow-list is written to the debug log.
81 if (!in_array($detected, $available)) {
82 Logger::debug('Unsupported language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
// Log the differing result and overwrite the caller's detection data with the
// CLD2 code; language_probability is divided by 100 before it is stored.
86 Logger::debug('Detected different language', ['uri-id' => $data['uri-id'], 'original' => $original, 'detected' => $detected, 'name' => $result['language_name'], 'probability' => $result['language_probability'], 'text' => $data['text']]);
87 $data['detected'] = [$detected => $result['language_probability'] / 100];