updated pear LangDetect to version 1.0.0

author Tobias Diekershoff <tobias.diekershoff@gmx.net>

Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)

committer Tobias Diekershoff <tobias.diekershoff@gmx.net>

Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)
author Tobias Diekershoff <tobias.diekershoff@gmx.net>
Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)
committer Tobias Diekershoff <tobias.diekershoff@gmx.net>
Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)
diff --git a/library/langdet/README.rst b/library/langdet/README.rst

new file mode 100644 (file)

index 0000000..9381c7f
--- /dev/null
+++ b/library/langdet/README.rst
@@ -0,0 +1,157 @@
+*******************
+Text_LanguageDetect
+*******************
+PHP library to identify human languages from text samples.
+Returns confidence scores for each.
+
+
+Installation
+============
+
+PEAR
+----
+::
+
+    $ pear install Text_LanguageDetect
+
+Composer
+--------
+::
+
+    $ composer require pear/text_languagedetect
+
+
+Usage
+=====
+Also see the examples in the ``docs/`` directory and
+the `official documentation`__.
+
+__ http://pear.php.net/package/Text_LanguageDetect/docs
+
+Language detection
+------------------
+Simple language detection::
+
+    <?php
+    require_once 'Text/LanguageDetect.php';
+
+    $text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
+
+    $ld = new Text_LanguageDetect();
+    $language = $ld->detectSimple($text);
+
+    echo $language;
+    //output: german
+
+Show the three most probable languages with their confidence score::
+
+    <?php
+    require_once 'Text/LanguageDetect.php';
+
+    $text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
+
+    $ld = new Text_LanguageDetect();
+    //3 most probable languages
+    $results = $ld->detect($text, 3);
+
+    foreach ($results as $language => $confidence) {
+        echo $language . ': ' . number_format($confidence, 2) . "\n";
+    }
+
+    //output:
+    //german: 0.35
+    //dutch: 0.25
+    //swedish: 0.20
+    ?>
+
+
+Language code
+-------------
+Instead of returning the full language name, ISO 639-1 two-letter or
+ISO 639-2 three-letter codes can be returned::
+
+    <?php
+    require_once 'Text/LanguageDetect.php';
+    $ld = new Text_LanguageDetect();
+
+    //will output the ISO 639-1 two-letter language code
+    // "de"
+    $ld->setNameMode(2);
+    echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
+
+    //will output the ISO 639-2 three-letter language code
+    // "deu"
+    $ld->setNameMode(3);
+    echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
+    ?>
+
+
+Supported languages
+===================
+- albanian
+- arabic
+- azeri
+- bengali
+- bulgarian
+- cebuano
+- croatian
+- czech
+- danish
+- dutch
+- english
+- estonian
+- farsi
+- finnish
+- french
+- german
+- hausa
+- hawaiian
+- hindi
+- hungarian
+- icelandic
+- indonesian
+- italian
+- kazakh
+- kyrgyz
+- latin
+- latvian
+- lithuanian
+- macedonian
+- mongolian
+- nepali
+- norwegian
+- pashto
+- pidgin
+- polish
+- portuguese
+- romanian
+- russian
+- serbian
+- slovak
+- slovene
+- somali
+- spanish
+- swahili
+- swedish
+- tagalog
+- turkish
+- ukrainian
+- urdu
+- uzbek
+- vietnamese
+- welsh
+
+
+Links
+=====
+Homepage
+  http://pear.php.net/package/Text_LanguageDetect
+Bug tracker
+  http://pear.php.net/bugs/search.php?cmd=display&package_name[]=Text_LanguageDetect
+Documentation
+  http://pear.php.net/package/Text_LanguageDetect/docs
+Unit test status
+  https://travis-ci.org/pear/Text_LanguageDetect
+
+  .. image:: https://travis-ci.org/pear/Text_LanguageDetect.svg?branch=master
+     :target: https://travis-ci.org/pear/Text_LanguageDetect
diff --git a/library/langdet/Text/LanguageDetect.php b/library/langdet/Text/LanguageDetect.php

index 45337ea8fca6054206f3b7851a989e474fe0f981..7f7d58a96b5d410cc444bafb9d0d9c9e09bc4234 100644 (file)
--- a/library/langdet/Text/LanguageDetect.php
+++ b/library/langdet/Text/LanguageDetect.php
@@ -1,13 +1,6 @@
  <?php
-
  /**
- * Detects the language of a given piece of text.
- *
- * Attempts to detect the language of a sample of text by correlating ranked
- * 3-gram frequencies to a table of 3-gram frequencies of known languages.
- *
- * Implements a version of a technique originally proposed by Cavnar & Trenkle
- * (1994): "N-Gram-Based Text Categorization"
+ * Part of Text_LanguageDetect
   *
   * PHP version 5
   *
@@ -15,20 +8,24 @@
   * @package   Text_LanguageDetect
   * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
   * @copyright 2005-2006 Nicholas Pisarro
- * @license   http://www.debian.org/misc/bsd.license BSD
- * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
   * @link      http://pear.php.net/package/Text_LanguageDetect/
- * @link      http://langdetect.blogspot.com/
   */
  
-require_once 'library/langdet/Text/LanguageDetect/Exception.php';
-require_once 'library/langdet/Text/LanguageDetect/Parser.php';
-require_once 'library/langdet/Text/LanguageDetect/ISO639.php';
+require_once 'Text/LanguageDetect/Exception.php';
+require_once 'Text/LanguageDetect/Parser.php';
+require_once 'Text/LanguageDetect/ISO639.php';
  
  /**
- * Language detection class
+ * Detects the language of a given piece of text.
+ *
+ * Attempts to detect the language of a sample of text by correlating ranked
+ * 3-gram frequencies to a table of 3-gram frequencies of known languages.
+ *
+ * Implements a version of a technique originally proposed by Cavnar & Trenkle
+ * (1994): "N-Gram-Based Text Categorization"
   *
- * Requires the langauge model database (lang.dat) that should have
+ * Requires the language model database (lang.dat) that should have
   * accompanied this class definition in order to be instantiated.
   *
   * Example usage:
@@ -60,10 +57,9 @@ require_once 'library/langdet/Text/LanguageDetect/ISO639.php';
   * @package   Text_LanguageDetect
   * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
   * @copyright 2005 Nicholas Pisarro
- * @license   http://www.debian.org/misc/bsd.license BSD
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
   * @version   Release: @package_version@
   * @link      http://pear.php.net/package/Text_LanguageDetect/
- * @todo      allow users to generate their own language models
   */
  class Text_LanguageDetect
  {
@@ -73,10 +69,9 @@ class Text_LanguageDetect
       * If this value starts with a slash (/) or a dot (.) the value of
       * $this->_data_dir will be ignored
       *
-     * @var      string
-     * @access   private
+     * @var string
       */
-    var $_db_filename = 'lang.dat';
+    protected $_db_filename = 'lang.dat';
  
      /**
       * The filename that stores the unicode block definitions
@@ -85,83 +80,74 @@ class Text_LanguageDetect
       * $this->_data_dir will be ignored
       *
       * @var string
-     * @access private
       */
-    var $_unicode_db_filename = 'unicode_blocks.dat';
+    protected $_unicode_db_filename = 'unicode_blocks.dat';
  
      /**
       * The data directory
       *
       * Should be set by PEAR installer
       *
-     * @var      string
-     * @access   private
+     * @var string
       */
-    var $_data_dir = '@data_dir@';
+    protected $_data_dir = '@data_dir@';
  
      /**
       * The trigram data for comparison
       *
       * Will be loaded on start from $this->_db_filename
       *
-     * @var      array
-     * @access   private
+     * @var array
       */
-    var $_lang_db = array();
+    protected $_lang_db = array();
  
      /**
-     * stores the map of the trigram data to unicode characters
+     * Stores the map of the trigram data to unicode characters
       *
-     * @access private
       * @var array
       */
-    var $_unicode_map;
+    protected $_unicode_map;
  
      /**
       * The size of the trigram data arrays
       *
-     * @var      int
-     * @access   private
+     * @var int
       */
-    var $_threshold = 300;
+    protected $_threshold = 300;
  
      /**
-     * the maximum possible score.
+     * The maximum possible score.
       *
-     * needed for score normalization. Different depending on the
+     * Needed for score normalization. Different depending on the
       * perl compatibility setting
       *
-     * @access  private
-     * @var     int
-     * @see     setPerlCompatible()
+     * @var int
+     * @see setPerlCompatible()
       */
-    var $_max_score = 0;
+    protected $_max_score = 0;
  
      /**
       * Whether or not to simulate perl's Language::Guess exactly
       *
-     * @access  private
-     * @var     bool
-     * @see     setPerlCompatible()
+     * @var bool
+     * @see setPerlCompatible()
       */
-    var $_perl_compatible = false;
+    protected $_perl_compatible = false;
  
      /**
       * Whether to use the unicode block detection to speed up processing
       *
-     * @access private
       * @var bool
       */
-    var $_use_unicode_narrowing = true;
+    protected $_use_unicode_narrowing = true;
  
      /**
-     * stores the result of the clustering operation
+     * Stores the result of the clustering operation
       *
-     * @access  private
-     * @var     array
-     * @see     clusterLanguages()
+     * @var array
+     * @see clusterLanguages()
       */
-    var $_clusters;
+    protected $_clusters;
  
      /**
       * Which type of "language names" are accepted and returned:
@@ -170,7 +156,7 @@ class Text_LanguageDetect
       * 2 - 2-letter ISO 639-1 code ("en")
       * 3 - 3-letter ISO 639-2 code ("eng")
       */
-    var $_name_mode = 0;
+    protected $_name_mode = 0;
  
      /**
       * Constructor
@@ -178,7 +164,7 @@ class Text_LanguageDetect
       * Will attempt to load the language database. If it fails, you will get
       * an exception.
       */
-    function __construct()
+    public function __construct()
      {
          $data = $this->_readdb($this->_db_filename);
          $this->_checkTrigram($data['trigram']);
@@ -200,9 +186,8 @@ class Text_LanguageDetect
       * @param string $fname File name to load
       *
       * @return string expected path to the language model database
-     * @access private
       */
-    function _get_data_loc($fname)
+    protected function _get_data_loc($fname)
      {
          if ($fname{0} == '/' || $fname{0} == '.') {
              // if filename starts with a slash, assume it's an absolute pathname
@@ -229,9 +214,8 @@ class Text_LanguageDetect
       *
       * @return array the language model data
       * @throws Text_LanguageDetect_Exception
-     * @access private
       */
-    function _readdb($fname)
+    protected function _readdb($fname)
      {
          // finds the correct data dir
          $fname = $this->_get_data_loc($fname);
@@ -259,9 +243,8 @@ class Text_LanguageDetect
       * @param array $trigram Trigram data from database
       *
       * @return void
-     * @access private
       */
-    function _checkTrigram($trigram)
+    protected function _checkTrigram($trigram)
      {
          if (!is_array($trigram)) {
              if (ini_get('magic_quotes_runtime')) {
@@ -353,11 +336,10 @@ class Text_LanguageDetect
      /**
       * Returns the number of languages that this object can detect
       *
-     * @access public
       * @return int            the number of languages
-     * @throws   Text_LanguageDetect_Exception
+     * @throws Text_LanguageDetect_Exception
       */
-    function getLanguageCount()
+    public function getLanguageCount()
      {
          return count($this->_lang_db);
      }
@@ -395,11 +377,10 @@ class Text_LanguageDetect
      /**
       * Returns the list of detectable languages
       *
-     * @access public
-     * @return array        the names of the languages known to this object
-     * @throws   Text_LanguageDetect_Exception
+     * @return array        the names of the languages known to this object
+     * @throws Text_LanguageDetect_Exception
       */
-    function getLanguages()
+    public function getLanguages()
      {
          return $this->_convertToNameMode(
              array_keys($this->_lang_db)
@@ -437,7 +418,7 @@ class Text_LanguageDetect
       *
       * @return void
       */
-    function setNameMode($name_mode)
+    public function setNameMode($name_mode)
      {
          $this->_name_mode = $name_mode;
      }
@@ -467,10 +448,9 @@ class Text_LanguageDetect
       * @param string $text text to convert
       *
       * @return     array array of trigram frequencies
-     * @access     private
       * @deprecated Superceded by the Text_LanguageDetect_Parser class
       */
-    function _trigram($text)
+    protected function _trigram($text)
      {
          $s = new Text_LanguageDetect_Parser($text);
          $s->prepareTrigram();
@@ -488,9 +468,8 @@ class Text_LanguageDetect
       * @param array $arr array of trigram
       *
       * @return array ranks of trigrams
-     * @access protected
       */
-    function _arr_rank($arr)
+    protected function _arr_rank($arr)
      {
  
          // sorts alphabetically first as a standard way of breaking rank ties
@@ -518,12 +497,11 @@ class Text_LanguageDetect
      /**
       * Sorts an array by value breaking ties alphabetically
       *
-     * @param array &$arr the array to sort
+     * @param array $arr the array to sort
       *
       * @return void
-     * @access private
       */
-    function _bub_sort(&$arr)
+    protected function _bub_sort(&$arr)
      {
          // should do the same as this perl statement:
          // sort { $trigrams{$b} == $trigrams{$a}
@@ -561,9 +539,8 @@ class Text_LanguageDetect
       *
       * @return int 1 if $a is greater, -1 if not
       * @see    _bub_sort()
-     * @access private
       */
-    function _sort_func($a, $b)
+    protected function _sort_func($a, $b)
      {
          // each is actually a key/value pair, so that it can compare using both
          list($a_key, $a_value) = $a;
@@ -601,9 +578,8 @@ class Text_LanguageDetect
       *
       * @return int the sum of the differences between the ranks of
       *             the two trigram sets
-     * @access private
       */
-    function _distance($arr1, $arr2)
+    protected function _distance($arr1, $arr2)
      {
          $sumdist = 0;
  
@@ -634,9 +610,8 @@ class Text_LanguageDetect
       *
       * @return float the normalized score
       * @see    _distance()
-     * @access private
       */
-    function _normalize_score($score, $base_count = null)
+    protected function _normalize_score($score, $base_count = null)
      {
          if ($base_count === null) {
              $base_count = $this->_threshold;
@@ -712,7 +687,7 @@ class Text_LanguageDetect
          $sample_obj->setPadStart(!$this->_perl_compatible);
          $sample_obj->analyze();
  
-        $trigram_freqs =& $sample_obj->getTrigramRanks();
+        $trigram_freqs = $sample_obj->getTrigramRanks();
          $trigram_count = count($trigram_freqs);
  
          if ($trigram_count == 0) {
@@ -723,7 +698,7 @@ class Text_LanguageDetect
  
          // use unicode block detection to narrow down the possibilities
          if ($this->_use_unicode_narrowing) {
-            $blocks =& $sample_obj->getUnicodeBlocks();
+            $blocks = $sample_obj->getUnicodeBlocks();
  
              if (is_array($blocks)) {
                  $present_blocks = array_keys($blocks);
@@ -975,9 +950,8 @@ class Text_LanguageDetect
       *
       * @return mixed Block name, -1 if it failed
       * @see    unicodeBlockName()
-     * @access protected
       */
-    function _unicode_block_name($unicode, $blocks, $block_count = -1)
+    protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
      {
          // for a reference, see
          // http://www.unicode.org/Public/UNIDATA/Blocks.txt
@@ -1028,9 +1002,8 @@ class Text_LanguageDetect
       *
       * @return array the database of unicode block definitions
       * @throws Text_LanguageDetect_Exception
-     * @access protected
       */
-    function _read_unicode_block_db()
+    protected function _read_unicode_block_db()
      {
          // since the unicode definitions are always going to be the same,
          // might as well share the memory for the db with all other instances
@@ -1149,14 +1122,13 @@ class Text_LanguageDetect
       * Uses a nearest neighbor technique to generate the maximum possible
       * number of dendograms from the similarity data.
       *
-     * @access      public
-     * @return      array language cluster data
-     * @throws      Text_LanguageDetect_Exception
-     * @see         languageSimilarity()
-     * @deprecated  this function will eventually be removed and placed into
+     * @return     array language cluster data
+     * @throws     Text_LanguageDetect_Exception
+     * @see        languageSimilarity()
+     * @deprecated this function will eventually be removed and placed into
       *              the model generation class
       */
-    function clusterLanguages()
+    public function clusterLanguages()
      {
          // todo: set the maximum number of clusters
          // return cached result, if any
@@ -1465,7 +1437,7 @@ class Text_LanguageDetect
      }
  
      /**
-     * ut8-safe strlen()
+     * UTF8-safe strlen()
       *
       * Returns the numbers of characters (not bytes) in a utf8 string
       *
@@ -1489,10 +1461,9 @@ class Text_LanguageDetect
       * @param string $char a utf8 (possibly multi-byte) char
       *
       * @return int unicode value
-     * @access protected
       * @link   http://en.wikipedia.org/wiki/UTF-8
       */
-    function _utf8char2unicode($char)
+    protected function _utf8char2unicode($char)
      {
          // strlen() here will actually get the binary length of a single char
          switch (strlen($char)) {
@@ -1529,20 +1500,19 @@ class Text_LanguageDetect
      }
  
      /**
-     * utf8-safe fast character iterator
+     * UTF8-safe fast character iterator
       *
       * Will get the next character starting from $counter, which will then be
       * incremented. If a multi-byte char the bytes will be concatenated and
       * $counter will be incremeted by the number of bytes in the char.
       *
       * @param string $str             the string being iterated over
-     * @param int    &$counter        the iterator, will increment by reference
+     * @param int    $counter         the iterator, will increment by reference
       * @param bool   $special_convert whether to do special conversions
       *
       * @return char the next (possibly multi-byte) char from $counter
-     * @access private
       */
-    static function _next_char($str, &$counter, $special_convert = false)
+    protected static function _next_char($str, &$counter, $special_convert = false)
      {
          $char = $str{$counter++};
          $ord = ord($char);
@@ -1634,7 +1604,7 @@ class Text_LanguageDetect
       *
       * @return string|array Language name
       */
-    function _convertFromNameMode($lang, $convertKey = false)
+    protected function _convertFromNameMode($lang, $convertKey = false)
      {
          if ($this->_name_mode == 0) {
              return $lang;
@@ -1674,7 +1644,7 @@ class Text_LanguageDetect
       *
       * @return string|array Language name
       */
-    function _convertToNameMode($lang, $convertKey = false)
+    protected function _convertToNameMode($lang, $convertKey = false)
      {
          if ($this->_name_mode == 0) {
              return $lang;
diff --git a/library/langdet/Text/LanguageDetect/Exception.php b/library/langdet/Text/LanguageDetect/Exception.php

index 196d994f5cb712fa7e7105e2a994705ce476f70d..cdbfe13ba110a7f29cf3e4b9f9e491ec723d4361 100644 (file)
--- a/library/langdet/Text/LanguageDetect/Exception.php
+++ b/library/langdet/Text/LanguageDetect/Exception.php
@@ -1,4 +1,28 @@
  <?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category Text
+ * @package  Text_LanguageDetect
+ * @author   Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @license  BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link     http://pear.php.net/package/Text_LanguageDetect/
+ */
+
+/**
+ * Part of the PEAR language detection package
+ *
+ * PHP version 5
+ *
+ * @category Text
+ * @package  Text_LanguageDetect
+ * @author   Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @license  BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link     http://pear.php.net/package/Text_LanguageDetect/
+ * @link     http://langdetect.blogspot.com/
+ */
  class Text_LanguageDetect_Exception extends Exception
  {
      /**
diff --git a/library/langdet/Text/LanguageDetect/ISO639.php b/library/langdet/Text/LanguageDetect/ISO639.php

index c577a2e1a06a8b3e407ed8dbb8aa65560d7d9028..9fd76c92029479ab546c0e0b5122349a38a79ae2 100644 (file)
--- a/library/langdet/Text/LanguageDetect/ISO639.php
+++ b/library/langdet/Text/LanguageDetect/ISO639.php
@@ -9,7 +9,6 @@
   * @author    Christian Weiske <cweiske@php.net>
   * @copyright 2011 Christian Weiske <cweiske@php.net>
   * @license   http://www.debian.org/misc/bsd.license BSD
- * @version   SVN: $Id$
   * @link      http://pear.php.net/package/Text_LanguageDetect/
   */
  
@@ -23,7 +22,7 @@
   * @package   Text_LanguageDetect
   * @author    Christian Weiske <cweiske@php.net>
   * @copyright 2011 Christian Weiske <cweiske@php.net>
- * @license   http://www.debian.org/misc/bsd.license BSD
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
   * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
   */
  class Text_LanguageDetect_ISO639
diff --git a/library/langdet/Text/LanguageDetect/Parser.php b/library/langdet/Text/LanguageDetect/Parser.php

index 1c20c2657eb2b466d90846842547576e026b5fcb..3ec1776401a8a0bc88bf81a149a7833b4f1f0ea7 100644 (file)
--- a/library/langdet/Text/LanguageDetect/Parser.php
+++ b/library/langdet/Text/LanguageDetect/Parser.php
@@ -1,16 +1,15 @@
  <?php
-
  /**
- * This class represents a text sample to be parsed.
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
   *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
- * @link        http://pear.php.net/package/Text_LanguageDetect/
- * @link        http://langdetect.blogspot.com/
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2006 Nicholas Pisarro
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
   */
  
  /**
@@ -20,99 +19,106 @@
   * class. After a new profile has been built, the data can be retrieved using
   * the accessor functions.
   *
- * This class is intended to be used by the Text_LanguageDetect class, not 
+ * This class is intended to be used by the Text_LanguageDetect class, not
   * end-users.
   *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     release: 0.3.0
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2006 Nicholas Pisarro
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
+ * @version   Release: @package_version@
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
   */
  class Text_LanguageDetect_Parser extends Text_LanguageDetect
  {
      /**
-     * the piece of text being parsed
+     * The piece of text being parsed
       *
-     * @access  private
-     * @var     string
+     * @var string
       */
-    var $_string;
+    protected $_string;
  
      /**
-     * stores the trigram frequencies of the sample
+     * Stores the trigram frequencies of the sample
       *
-     * @access  private
-     * @var     string
+     * @var array
       */
-    var $_trigrams = array();
+    protected $_trigrams = array();
  
      /**
-     * stores the trigram ranks of the sample
+     * Stores the trigram ranks of the sample
       *
-     * @access  private
-     * @var     array
+     * @var array
       */
-    var $_trigram_ranks = array();
+    protected $_trigram_ranks = array();
  
      /**
-     * stores the unicode blocks of the sample
+     * Stores the unicode blocks of the sample
       *
-     * @access  private
-     * @var     array
+     * @var array
       */
-    var $_unicode_blocks = array();
-    
+    protected $_unicode_blocks = array();
+
      /**
       * Whether the parser should compile the unicode ranges
-     * 
-     * @access  private
-     * @var     bool
+     *
+     * @var bool
       */
-    var $_compile_unicode = false;
+    protected $_compile_unicode = false;
  
      /**
       * Whether the parser should compile trigrams
       *
-     * @access  private
-     * @var     bool
+     * @var bool
       */
-    var $_compile_trigram = false;
+    protected $_compile_trigram = false;
  
      /**
       * Whether the trigram parser should pad the beginning of the string
       *
-     * @access  private
-     * @var     bool
+     * @var bool
       */
-    var $_trigram_pad_start = false;
+    protected $_trigram_pad_start = false;
  
      /**
       * Whether the unicode parser should skip non-alphabetical ascii chars
       *
-     * @access  private
-     * @var     bool
+     * @var bool
       */
-    var $_unicode_skip_symbols = true;
+    protected $_unicode_skip_symbols = true;
  
      /**
       * Constructor
       *
-     * @access  private
-     * @param   string  $string     string to be parsed
+     * @param string $string string to be parsed
       */
-    function Text_LanguageDetect_Parser($string) {
+    public function __construct($string)
+    {
          $this->_string = $string;
      }
  
+    /**
+     * PHP 4 constructor for backwards compatibility.
+     *
+     * @param string $string string to be parsed
+     *
+     * @return void
+     */
+    public function Text_LanguageDetect_Parser($string)
+    {
+        self::__construct($string);
+    }
+
      /**
       * Returns true if a string is suitable for parsing
       *
-     * @param   string  $str    input string to test
-     * @return  bool            true if acceptable, false if not
+     * @param string $str input string to test
+     *
+     * @return bool true if acceptable, false if not
       */
-    public static function validateString($str) {
+    public static function validateString($str)
+    {
          if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
              return true;
          } else {
@@ -121,34 +127,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
      }
  
      /**
-     * turn on/off trigram counting
+     * Turn on/off trigram counting
       *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
+     * @param bool $bool true for on, false for off
+     *
+     * @return void
       */
-    function prepareTrigram($bool = true)
+    public function prepareTrigram($bool = true)
      {
          $this->_compile_trigram = $bool;
      }
  
      /**
-     * turn on/off unicode block counting
+     * Turn on/off unicode block counting
+     *
+     * @param bool $bool true for on, false for off
       *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
+     * @return void
       */
-    function prepareUnicode($bool = true)
+    public function prepareUnicode($bool = true)
      {
          $this->_compile_unicode = $bool;
      }
  
      /**
-     * turn on/off padding the beginning of the sample string
+     * Turn on/off padding the beginning of the sample string
+     *
+     * @param bool $bool true for on, false for off
       *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
+     * @return void
       */
-    function setPadStart($bool = true)
+    public function setPadStart($bool = true)
      {
          $this->_trigram_pad_start = $bool;
      }
@@ -156,10 +165,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
      /**
       * Should the unicode block counter skip non-alphabetical ascii chars?
       *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
+     * @param bool $bool true for on, false for off
+     *
+     * @return void
       */
-    function setUnicodeSkipSymbols($bool = true)
+    public function setUnicodeSkipSymbols($bool = true)
      {
          $this->_unicode_skip_symbols = $bool;
      }
@@ -167,10 +177,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
      /**
       * Returns the trigram ranks for the text sample
       *
-     * @access  public
-     * @return  array    trigram ranks in the text sample
+     * @return array Trigram ranks in the text sample
       */
-    function &getTrigramRanks()
+    public function getTrigramRanks()
      {
          return $this->_trigram_ranks;
      }
@@ -178,39 +187,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
      /**
       * Return the trigram freqency table
       *
-     * only used in testing to make sure the parser is working
+     * Only used in testing to make sure the parser is working
       *
-     * @access  public
-     * @return  array    trigram freqencies in the text sample
+     * @return array Trigram frequencies in the text sample
       */
-    function &getTrigramFreqs()
+    public function getTrigramFreqs()
      {
          return $this->_trigram;
      }
  
      /**
-     * returns the array of unicode blocks
+     * Returns the array of unicode blocks
       *
-     * @access  public
-     * @return  array   unicode blocks in the text sample
+     * @return array Unicode blocks in the text sample
       */
-    function &getUnicodeBlocks()
+    public function getUnicodeBlocks()
      {
          return $this->_unicode_blocks;
      }
  
      /**
       * Executes the parsing operation
-     * 
-     * Be sure to call the set*() functions to set options and the 
+     *
+     * Be sure to call the set*() functions to set options and the
       * prepare*() functions first to tell it what kind of data to compute
       *
       * Afterwards the get*() functions can be used to access the compiled
       * information.
       *
-     * @access public
+     * @return void
       */
-    function analyze()
+    public function analyze()
      {
          $len = strlen($this->_string);
          $byte_counter = 0;
@@ -258,9 +265,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
              if ($this->_compile_trigram) {
                  if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                      if (!isset($this->_trigram[$a . $b . $char])) {
-                       $this->_trigram[$a . $b . $char] = 1;
+                        $this->_trigram[$a . $b . $char] = 1;
                      } else {
-                       $this->_trigram[$a . $b . $char]++;
+                        $this->_trigram[$a . $b . $char]++;
                      }
                  }
  
@@ -271,10 +278,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
              // unicode block detection
              if ($this->_compile_unicode) {
                  if ($this->_unicode_skip_symbols
-                        && strlen($char) == 1
-                        && ($char < 'A' || $char > 'z'
-                        || ($char > 'Z' && $char < 'a'))
-                        && $char != "'") {  // does not skip the apostrophe
+                    && strlen($char) == 1
+                    && ($char < 'A' || $char > 'z'
+                    || ($char > 'Z' && $char < 'a'))
+                    && $char != "'"
+                ) {  // does not skip the apostrophe
                                              // since it's included in the language
                                              // models
  
@@ -297,7 +305,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
          if ($this->_compile_unicode) {
              foreach ($unicode_chars as $utf8_char => $count) {
                  $search_result = $this->_unicode_block_name(
-                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
+                );
  
                  if ($search_result != -1) {
                      $block_name = $search_result[2];
diff --git a/library/langdet/data/build-unicode_blocks.php b/library/langdet/data/build-unicode_blocks.php

new file mode 100644 (file)

index 0000000..afa75a5
--- /dev/null
+++ b/library/langdet/data/build-unicode_blocks.php
@@ -0,0 +1,7 @@
+<?php
+/**
+ * Generate the serialized unicode_blocks.dat file shipped with the package from unicode_blocks.php
+ */
+$unicode_blocks = include __DIR__ . '/unicode_blocks.php'; // include evaluates to the array that file returns
+file_put_contents(__DIR__ . '/unicode_blocks.dat', serialize($unicode_blocks)); // replaces the .dat next to this script; the write result is not checked
+?>
+\ No newline at end of file
diff --git a/library/langdet/data/unicode_blocks.dat b/library/langdet/data/unicode_blocks.dat

index 3b24cd2c161897818f5b67e0ba59495bbc9b2173..1f66cac728d71ea24d43f79f1376b0b3cc05cad1 100644 (file)
--- a/library/langdet/data/unicode_blocks.dat
+++ b/library/langdet/data/unicode_blocks.dat
@@ -1 +1 @@
-a:145:{i:0;a:3:{i:0;s:6:"0x0000";i:1;s:6:"0x007F";i:2;s:11:"Basic Latin";}i:1;a:3:{i:0;s:6:"0x0080";i:1;s:6:"0x00FF";i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;s:6:"0x0100";i:1;s:6:"0x017F";i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;s:6:"0x0180";i:1;s:6:"0x024F";i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;s:6:"0x0250";i:1;s:6:"0x02AF";i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;s:6:"0x02B0";i:1;s:6:"0x02FF";i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;s:6:"0x0300";i:1;s:6:"0x036F";i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;s:6:"0x0370";i:1;s:6:"0x03FF";i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;s:6:"0x0400";i:1;s:6:"0x04FF";i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;s:6:"0x0500";i:1;s:6:"0x052F";i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;s:6:"0x0530";i:1;s:6:"0x058F";i:2;s:8:"Armenian";}i:11;a:3:{i:0;s:6:"0x0590";i:1;s:6:"0x05FF";i:2;s:6:"Hebrew";}i:12;a:3:{i:0;s:6:"0x0600";i:1;s:6:"0x06FF";i:2;s:6:"Arabic";}i:13;a:3:{i:0;s:6:"0x0700";i:1;s:6:"0x074F";i:2;s:6:"Syriac";}i:14;a:3:{i:0;s:6:"0x0750";i:1;s:6:"0x077F";i:2;s:17:"Arabic 
Supplement";}i:15;a:3:{i:0;s:6:"0x0780";i:1;s:6:"0x07BF";i:2;s:6:"Thaana";}i:16;a:3:{i:0;s:6:"0x0900";i:1;s:6:"0x097F";i:2;s:10:"Devanagari";}i:17;a:3:{i:0;s:6:"0x0980";i:1;s:6:"0x09FF";i:2;s:7:"Bengali";}i:18;a:3:{i:0;s:6:"0x0A00";i:1;s:6:"0x0A7F";i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;s:6:"0x0A80";i:1;s:6:"0x0AFF";i:2;s:8:"Gujarati";}i:20;a:3:{i:0;s:6:"0x0B00";i:1;s:6:"0x0B7F";i:2;s:5:"Oriya";}i:21;a:3:{i:0;s:6:"0x0B80";i:1;s:6:"0x0BFF";i:2;s:5:"Tamil";}i:22;a:3:{i:0;s:6:"0x0C00";i:1;s:6:"0x0C7F";i:2;s:6:"Telugu";}i:23;a:3:{i:0;s:6:"0x0C80";i:1;s:6:"0x0CFF";i:2;s:7:"Kannada";}i:24;a:3:{i:0;s:6:"0x0D00";i:1;s:6:"0x0D7F";i:2;s:9:"Malayalam";}i:25;a:3:{i:0;s:6:"0x0D80";i:1;s:6:"0x0DFF";i:2;s:7:"Sinhala";}i:26;a:3:{i:0;s:6:"0x0E00";i:1;s:6:"0x0E7F";i:2;s:4:"Thai";}i:27;a:3:{i:0;s:6:"0x0E80";i:1;s:6:"0x0EFF";i:2;s:3:"Lao";}i:28;a:3:{i:0;s:6:"0x0F00";i:1;s:6:"0x0FFF";i:2;s:7:"Tibetan";}i:29;a:3:{i:0;s:6:"0x1000";i:1;s:6:"0x109F";i:2;s:7:"Myanmar";}i:30;a:3:{i:0;s:6:"0x10A0";i:1;s:6:"0x10FF";i:2;s:8:"Georgian";}i:31;a:3:{i:0;s:6:"0x1100";i:1;s:6:"0x11FF";i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;s:6:"0x1200";i:1;s:6:"0x137F";i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;s:6:"0x1380";i:1;s:6:"0x139F";i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;s:6:"0x13A0";i:1;s:6:"0x13FF";i:2;s:8:"Cherokee";}i:35;a:3:{i:0;s:6:"0x1400";i:1;s:6:"0x167F";i:2;s:37:"Unified Canadian Aboriginal Syllabics";}i:36;a:3:{i:0;s:6:"0x1680";i:1;s:6:"0x169F";i:2;s:5:"Ogham";}i:37;a:3:{i:0;s:6:"0x16A0";i:1;s:6:"0x16FF";i:2;s:5:"Runic";}i:38;a:3:{i:0;s:6:"0x1700";i:1;s:6:"0x171F";i:2;s:7:"Tagalog";}i:39;a:3:{i:0;s:6:"0x1720";i:1;s:6:"0x173F";i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;s:6:"0x1740";i:1;s:6:"0x175F";i:2;s:5:"Buhid";}i:41;a:3:{i:0;s:6:"0x1760";i:1;s:6:"0x177F";i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;s:6:"0x1780";i:1;s:6:"0x17FF";i:2;s:5:"Khmer";}i:43;a:3:{i:0;s:6:"0x1800";i:1;s:6:"0x18AF";i:2;s:9:"Mongolian";}i:44;a:3:{i:0;s:6:"0x1900";i:1;s:6:"0x194F";i:2;s:5:"Limbu";}i:45;a:3:{i:0;s:6:"0x1950";i:1;s:6:"0x197F";i:2;s:6:"Tai 
Le";}i:46;a:3:{i:0;s:6:"0x1980";i:1;s:6:"0x19DF";i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;s:6:"0x19E0";i:1;s:6:"0x19FF";i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;s:6:"0x1A00";i:1;s:6:"0x1A1F";i:2;s:8:"Buginese";}i:49;a:3:{i:0;s:6:"0x1D00";i:1;s:6:"0x1D7F";i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;s:6:"0x1D80";i:1;s:6:"0x1DBF";i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;s:6:"0x1DC0";i:1;s:6:"0x1DFF";i:2;s:38:"Combining Diacritical Marks Supplement";}i:52;a:3:{i:0;s:6:"0x1E00";i:1;s:6:"0x1EFF";i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;s:6:"0x1F00";i:1;s:6:"0x1FFF";i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;s:6:"0x2000";i:1;s:6:"0x206F";i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;s:6:"0x2070";i:1;s:6:"0x209F";i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;s:6:"0x20A0";i:1;s:6:"0x20CF";i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;s:6:"0x20D0";i:1;s:6:"0x20FF";i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;s:6:"0x2100";i:1;s:6:"0x214F";i:2;s:18:"Letterlike Symbols";}i:59;a:3:{i:0;s:6:"0x2150";i:1;s:6:"0x218F";i:2;s:12:"Number Forms";}i:60;a:3:{i:0;s:6:"0x2190";i:1;s:6:"0x21FF";i:2;s:6:"Arrows";}i:61;a:3:{i:0;s:6:"0x2200";i:1;s:6:"0x22FF";i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;s:6:"0x2300";i:1;s:6:"0x23FF";i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;s:6:"0x2400";i:1;s:6:"0x243F";i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;s:6:"0x2440";i:1;s:6:"0x245F";i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;s:6:"0x2460";i:1;s:6:"0x24FF";i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;s:6:"0x2500";i:1;s:6:"0x257F";i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;s:6:"0x2580";i:1;s:6:"0x259F";i:2;s:14:"Block Elements";}i:68;a:3:{i:0;s:6:"0x25A0";i:1;s:6:"0x25FF";i:2;s:16:"Geometric Shapes";}i:69;a:3:{i:0;s:6:"0x2600";i:1;s:6:"0x26FF";i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;s:6:"0x2700";i:1;s:6:"0x27BF";i:2;s:8:"Dingbats";}i:71;a:3:{i:0;s:6:"0x27C0";i:1;s:6:"0x27EF";i:2;s:36:"Miscellaneous Mathematical 
Symbols-A";}i:72;a:3:{i:0;s:6:"0x27F0";i:1;s:6:"0x27FF";i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;s:6:"0x2800";i:1;s:6:"0x28FF";i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;s:6:"0x2900";i:1;s:6:"0x297F";i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;s:6:"0x2980";i:1;s:6:"0x29FF";i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;s:6:"0x2A00";i:1;s:6:"0x2AFF";i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;s:6:"0x2B00";i:1;s:6:"0x2BFF";i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;s:6:"0x2C00";i:1;s:6:"0x2C5F";i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;s:6:"0x2C80";i:1;s:6:"0x2CFF";i:2;s:6:"Coptic";}i:80;a:3:{i:0;s:6:"0x2D00";i:1;s:6:"0x2D2F";i:2;s:19:"Georgian Supplement";}i:81;a:3:{i:0;s:6:"0x2D30";i:1;s:6:"0x2D7F";i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;s:6:"0x2D80";i:1;s:6:"0x2DDF";i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;s:6:"0x2E00";i:1;s:6:"0x2E7F";i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;s:6:"0x2E80";i:1;s:6:"0x2EFF";i:2;s:23:"CJK Radicals Supplement";}i:85;a:3:{i:0;s:6:"0x2F00";i:1;s:6:"0x2FDF";i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;s:6:"0x2FF0";i:1;s:6:"0x2FFF";i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;s:6:"0x3000";i:1;s:6:"0x303F";i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;s:6:"0x3040";i:1;s:6:"0x309F";i:2;s:8:"Hiragana";}i:89;a:3:{i:0;s:6:"0x30A0";i:1;s:6:"0x30FF";i:2;s:8:"Katakana";}i:90;a:3:{i:0;s:6:"0x3100";i:1;s:6:"0x312F";i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;s:6:"0x3130";i:1;s:6:"0x318F";i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;s:6:"0x3190";i:1;s:6:"0x319F";i:2;s:6:"Kanbun";}i:93;a:3:{i:0;s:6:"0x31A0";i:1;s:6:"0x31BF";i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;s:6:"0x31C0";i:1;s:6:"0x31EF";i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;s:6:"0x31F0";i:1;s:6:"0x31FF";i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;s:6:"0x3200";i:1;s:6:"0x32FF";i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;s:6:"0x3300";i:1;s:6:"0x33FF";i:2;s:17:"CJK 
Compatibility";}i:98;a:3:{i:0;s:6:"0x3400";i:1;s:6:"0x4DBF";i:2;s:34:"CJK Unified Ideographs Extension A";}i:99;a:3:{i:0;s:6:"0x4DC0";i:1;s:6:"0x4DFF";i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;s:6:"0x4E00";i:1;s:6:"0x9FFF";i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;s:6:"0xA000";i:1;s:6:"0xA48F";i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;s:6:"0xA490";i:1;s:6:"0xA4CF";i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;s:6:"0xA700";i:1;s:6:"0xA71F";i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;s:6:"0xA800";i:1;s:6:"0xA82F";i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;s:6:"0xAC00";i:1;s:6:"0xD7AF";i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;s:6:"0xD800";i:1;s:6:"0xDB7F";i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;s:6:"0xDB80";i:1;s:6:"0xDBFF";i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;s:6:"0xDC00";i:1;s:6:"0xDFFF";i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;s:6:"0xE000";i:1;s:6:"0xF8FF";i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;s:6:"0xF900";i:1;s:6:"0xFAFF";i:2;s:28:"CJK Compatibility Ideographs";}i:111;a:3:{i:0;s:6:"0xFB00";i:1;s:6:"0xFB4F";i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;s:6:"0xFB50";i:1;s:6:"0xFDFF";i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;s:6:"0xFE00";i:1;s:6:"0xFE0F";i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;s:6:"0xFE10";i:1;s:6:"0xFE1F";i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;s:6:"0xFE20";i:1;s:6:"0xFE2F";i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;s:6:"0xFE30";i:1;s:6:"0xFE4F";i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;s:6:"0xFE50";i:1;s:6:"0xFE6F";i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;s:6:"0xFE70";i:1;s:6:"0xFEFF";i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;s:6:"0xFF00";i:1;s:6:"0xFFEF";i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;s:6:"0xFFF0";i:1;s:6:"0xFFFF";i:2;s:8:"Specials";}i:121;a:3:{i:0;s:7:"0x10000";i:1;s:7:"0x1007F";i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;s:7:"0x10080";i:1;s:7:"0x100FF";i:2;s:18:"Linear B 
Ideograms";}i:123;a:3:{i:0;s:7:"0x10100";i:1;s:7:"0x1013F";i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;s:7:"0x10140";i:1;s:7:"0x1018F";i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;s:7:"0x10300";i:1;s:7:"0x1032F";i:2;s:10:"Old Italic";}i:126;a:3:{i:0;s:7:"0x10330";i:1;s:7:"0x1034F";i:2;s:6:"Gothic";}i:127;a:3:{i:0;s:7:"0x10380";i:1;s:7:"0x1039F";i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;s:7:"0x103A0";i:1;s:7:"0x103DF";i:2;s:11:"Old Persian";}i:129;a:3:{i:0;s:7:"0x10400";i:1;s:7:"0x1044F";i:2;s:7:"Deseret";}i:130;a:3:{i:0;s:7:"0x10450";i:1;s:7:"0x1047F";i:2;s:7:"Shavian";}i:131;a:3:{i:0;s:7:"0x10480";i:1;s:7:"0x104AF";i:2;s:7:"Osmanya";}i:132;a:3:{i:0;s:7:"0x10800";i:1;s:7:"0x1083F";i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;s:7:"0x10A00";i:1;s:7:"0x10A5F";i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;s:7:"0x1D000";i:1;s:7:"0x1D0FF";i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;s:7:"0x1D100";i:1;s:7:"0x1D1FF";i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;s:7:"0x1D200";i:1;s:7:"0x1D24F";i:2;s:30:"Ancient Greek Musical Notation";}i:137;a:3:{i:0;s:7:"0x1D300";i:1;s:7:"0x1D35F";i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;s:7:"0x1D400";i:1;s:7:"0x1D7FF";i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;s:7:"0x20000";i:1;s:7:"0x2A6DF";i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;s:7:"0x2F800";i:1;s:7:"0x2FA1F";i:2;s:39:"CJK Compatibility Ideographs Supplement";}i:141;a:3:{i:0;s:7:"0xE0000";i:1;s:7:"0xE007F";i:2;s:4:"Tags";}i:142;a:3:{i:0;s:7:"0xE0100";i:1;s:7:"0xE01EF";i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;s:7:"0xF0000";i:1;s:7:"0xFFFFF";i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;s:8:"0x100000";i:1;s:8:"0x10FFFF";i:2;s:32:"Supplementary Private Use Area-B";}}
-\ No newline at end of file
+a:145:{i:0;a:3:{i:0;i:0;i:1;i:127;i:2;s:11:"Basic Latin";}i:1;a:3:{i:0;i:128;i:1;i:255;i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;i:256;i:1;i:383;i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;i:384;i:1;i:591;i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;i:592;i:1;i:687;i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;i:688;i:1;i:767;i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;i:768;i:1;i:879;i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;i:880;i:1;i:1023;i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;i:1024;i:1;i:1279;i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;i:1280;i:1;i:1327;i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;i:1328;i:1;i:1423;i:2;s:8:"Armenian";}i:11;a:3:{i:0;i:1424;i:1;i:1535;i:2;s:6:"Hebrew";}i:12;a:3:{i:0;i:1536;i:1;i:1791;i:2;s:6:"Arabic";}i:13;a:3:{i:0;i:1792;i:1;i:1871;i:2;s:6:"Syriac";}i:14;a:3:{i:0;i:1872;i:1;i:1919;i:2;s:17:"Arabic Supplement";}i:15;a:3:{i:0;i:1920;i:1;i:1983;i:2;s:6:"Thaana";}i:16;a:3:{i:0;i:2304;i:1;i:2431;i:2;s:10:"Devanagari";}i:17;a:3:{i:0;i:2432;i:1;i:2559;i:2;s:7:"Bengali";}i:18;a:3:{i:0;i:2560;i:1;i:2687;i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;i:2688;i:1;i:2815;i:2;s:8:"Gujarati";}i:20;a:3:{i:0;i:2816;i:1;i:2943;i:2;s:5:"Oriya";}i:21;a:3:{i:0;i:2944;i:1;i:3071;i:2;s:5:"Tamil";}i:22;a:3:{i:0;i:3072;i:1;i:3199;i:2;s:6:"Telugu";}i:23;a:3:{i:0;i:3200;i:1;i:3327;i:2;s:7:"Kannada";}i:24;a:3:{i:0;i:3328;i:1;i:3455;i:2;s:9:"Malayalam";}i:25;a:3:{i:0;i:3456;i:1;i:3583;i:2;s:7:"Sinhala";}i:26;a:3:{i:0;i:3584;i:1;i:3711;i:2;s:4:"Thai";}i:27;a:3:{i:0;i:3712;i:1;i:3839;i:2;s:3:"Lao";}i:28;a:3:{i:0;i:3840;i:1;i:4095;i:2;s:7:"Tibetan";}i:29;a:3:{i:0;i:4096;i:1;i:4255;i:2;s:7:"Myanmar";}i:30;a:3:{i:0;i:4256;i:1;i:4351;i:2;s:8:"Georgian";}i:31;a:3:{i:0;i:4352;i:1;i:4607;i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;i:4608;i:1;i:4991;i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;i:4992;i:1;i:5023;i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;i:5024;i:1;i:5119;i:2;s:8:"Cherokee";}i:35;a:3:{i:0;i:5120;i:1;i:5759;i:2;s:37:"Unified Canadian Aboriginal 
Syllabics";}i:36;a:3:{i:0;i:5760;i:1;i:5791;i:2;s:5:"Ogham";}i:37;a:3:{i:0;i:5792;i:1;i:5887;i:2;s:5:"Runic";}i:38;a:3:{i:0;i:5888;i:1;i:5919;i:2;s:7:"Tagalog";}i:39;a:3:{i:0;i:5920;i:1;i:5951;i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;i:5952;i:1;i:5983;i:2;s:5:"Buhid";}i:41;a:3:{i:0;i:5984;i:1;i:6015;i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;i:6016;i:1;i:6143;i:2;s:5:"Khmer";}i:43;a:3:{i:0;i:6144;i:1;i:6319;i:2;s:9:"Mongolian";}i:44;a:3:{i:0;i:6400;i:1;i:6479;i:2;s:5:"Limbu";}i:45;a:3:{i:0;i:6480;i:1;i:6527;i:2;s:6:"Tai Le";}i:46;a:3:{i:0;i:6528;i:1;i:6623;i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;i:6624;i:1;i:6655;i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;i:6656;i:1;i:6687;i:2;s:8:"Buginese";}i:49;a:3:{i:0;i:7424;i:1;i:7551;i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;i:7552;i:1;i:7615;i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;i:7616;i:1;i:7679;i:2;s:38:"Combining Diacritical Marks Supplement";}i:52;a:3:{i:0;i:7680;i:1;i:7935;i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;i:7936;i:1;i:8191;i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;i:8192;i:1;i:8303;i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;i:8304;i:1;i:8351;i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;i:8352;i:1;i:8399;i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;i:8400;i:1;i:8447;i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;i:8448;i:1;i:8527;i:2;s:18:"Letterlike Symbols";}i:59;a:3:{i:0;i:8528;i:1;i:8591;i:2;s:12:"Number Forms";}i:60;a:3:{i:0;i:8592;i:1;i:8703;i:2;s:6:"Arrows";}i:61;a:3:{i:0;i:8704;i:1;i:8959;i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;i:8960;i:1;i:9215;i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;i:9216;i:1;i:9279;i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;i:9280;i:1;i:9311;i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;i:9312;i:1;i:9471;i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;i:9472;i:1;i:9599;i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;i:9600;i:1;i:9631;i:2;s:14:"Block Elements";}i:68;a:3:{i:0;i:9632;i:1;i:9727;i:2;s:16:"Geometric 
Shapes";}i:69;a:3:{i:0;i:9728;i:1;i:9983;i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;i:9984;i:1;i:10175;i:2;s:8:"Dingbats";}i:71;a:3:{i:0;i:10176;i:1;i:10223;i:2;s:36:"Miscellaneous Mathematical Symbols-A";}i:72;a:3:{i:0;i:10224;i:1;i:10239;i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;i:10240;i:1;i:10495;i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;i:10496;i:1;i:10623;i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;i:10624;i:1;i:10751;i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;i:10752;i:1;i:11007;i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;i:11008;i:1;i:11263;i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;i:11264;i:1;i:11359;i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;i:11392;i:1;i:11519;i:2;s:6:"Coptic";}i:80;a:3:{i:0;i:11520;i:1;i:11567;i:2;s:19:"Georgian Supplement";}i:81;a:3:{i:0;i:11568;i:1;i:11647;i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;i:11648;i:1;i:11743;i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;i:11776;i:1;i:11903;i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;i:11904;i:1;i:12031;i:2;s:23:"CJK Radicals Supplement";}i:85;a:3:{i:0;i:12032;i:1;i:12255;i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;i:12272;i:1;i:12287;i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;i:12288;i:1;i:12351;i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;i:12352;i:1;i:12447;i:2;s:8:"Hiragana";}i:89;a:3:{i:0;i:12448;i:1;i:12543;i:2;s:8:"Katakana";}i:90;a:3:{i:0;i:12544;i:1;i:12591;i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;i:12592;i:1;i:12687;i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;i:12688;i:1;i:12703;i:2;s:6:"Kanbun";}i:93;a:3:{i:0;i:12704;i:1;i:12735;i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;i:12736;i:1;i:12783;i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;i:12784;i:1;i:12799;i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;i:12800;i:1;i:13055;i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;i:13056;i:1;i:13311;i:2;s:17:"CJK Compatibility";}i:98;a:3:{i:0;i:13312;i:1;i:19903;i:2;s:34:"CJK Unified Ideographs 
Extension A";}i:99;a:3:{i:0;i:19904;i:1;i:19967;i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;i:19968;i:1;i:40959;i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;i:40960;i:1;i:42127;i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;i:42128;i:1;i:42191;i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;i:42752;i:1;i:42783;i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;i:43008;i:1;i:43055;i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;i:44032;i:1;i:55215;i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;i:55296;i:1;i:56191;i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;i:56192;i:1;i:56319;i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;i:56320;i:1;i:57343;i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;i:57344;i:1;i:63743;i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;i:63744;i:1;i:64255;i:2;s:28:"CJK Compatibility Ideographs";}i:111;a:3:{i:0;i:64256;i:1;i:64335;i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;i:64336;i:1;i:65023;i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;i:65024;i:1;i:65039;i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;i:65040;i:1;i:65055;i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;i:65056;i:1;i:65071;i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;i:65072;i:1;i:65103;i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;i:65104;i:1;i:65135;i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;i:65136;i:1;i:65279;i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;i:65280;i:1;i:65519;i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;i:65520;i:1;i:65535;i:2;s:8:"Specials";}i:121;a:3:{i:0;i:65536;i:1;i:65663;i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;i:65664;i:1;i:65791;i:2;s:18:"Linear B Ideograms";}i:123;a:3:{i:0;i:65792;i:1;i:65855;i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;i:65856;i:1;i:65935;i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;i:66304;i:1;i:66351;i:2;s:10:"Old Italic";}i:126;a:3:{i:0;i:66352;i:1;i:66383;i:2;s:6:"Gothic";}i:127;a:3:{i:0;i:66432;i:1;i:66463;i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;i:66464;i:1;i:66527;i:2;s:11:"Old 
Persian";}i:129;a:3:{i:0;i:66560;i:1;i:66639;i:2;s:7:"Deseret";}i:130;a:3:{i:0;i:66640;i:1;i:66687;i:2;s:7:"Shavian";}i:131;a:3:{i:0;i:66688;i:1;i:66735;i:2;s:7:"Osmanya";}i:132;a:3:{i:0;i:67584;i:1;i:67647;i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;i:68096;i:1;i:68191;i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;i:118784;i:1;i:119039;i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;i:119040;i:1;i:119295;i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;i:119296;i:1;i:119375;i:2;s:30:"Ancient Greek Musical Notation";}i:137;a:3:{i:0;i:119552;i:1;i:119647;i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;i:119808;i:1;i:120831;i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;i:131072;i:1;i:173791;i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;i:194560;i:1;i:195103;i:2;s:39:"CJK Compatibility Ideographs Supplement";}i:141;a:3:{i:0;i:917504;i:1;i:917631;i:2;s:4:"Tags";}i:142;a:3:{i:0;i:917760;i:1;i:917999;i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;i:983040;i:1;i:1048575;i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;i:1048576;i:1;i:1114111;i:2;s:32:"Supplementary Private Use Area-B";}}
+\ No newline at end of file
diff --git a/library/langdet/data/unicode_blocks.php b/library/langdet/data/unicode_blocks.php

new file mode 100644 (file)

index 0000000..2be6a19
--- /dev/null
+++ b/library/langdet/data/unicode_blocks.php
@@ -0,0 +1,874 @@
+<?php
+return array (
+  0 =>
+  array (
+    0 => 0x0000,
+    1 => 0x007F,
+    2 => 'Basic Latin',
+  ),
+  1 =>
+  array (
+    0 => 0x0080,
+    1 => 0x00FF,
+    2 => 'Latin-1 Supplement',
+  ),
+  2 =>
+  array (
+    0 => 0x0100,
+    1 => 0x017F,
+    2 => 'Latin Extended-A',
+  ),
+  3 =>
+  array (
+    0 => 0x0180,
+    1 => 0x024F,
+    2 => 'Latin Extended-B',
+  ),
+  4 =>
+  array (
+    0 => 0x0250,
+    1 => 0x02AF,
+    2 => 'IPA Extensions',
+  ),
+  5 =>
+  array (
+    0 => 0x02B0,
+    1 => 0x02FF,
+    2 => 'Spacing Modifier Letters',
+  ),
+  6 =>
+  array (
+    0 => 0x0300,
+    1 => 0x036F,
+    2 => 'Combining Diacritical Marks',
+  ),
+  7 =>
+  array (
+    0 => 0x0370,
+    1 => 0x03FF,
+    2 => 'Greek and Coptic',
+  ),
+  8 =>
+  array (
+    0 => 0x0400,
+    1 => 0x04FF,
+    2 => 'Cyrillic',
+  ),
+  9 =>
+  array (
+    0 => 0x0500,
+    1 => 0x052F,
+    2 => 'Cyrillic Supplement',
+  ),
+  10 =>
+  array (
+    0 => 0x0530,
+    1 => 0x058F,
+    2 => 'Armenian',
+  ),
+  11 =>
+  array (
+    0 => 0x0590,
+    1 => 0x05FF,
+    2 => 'Hebrew',
+  ),
+  12 =>
+  array (
+    0 => 0x0600,
+    1 => 0x06FF,
+    2 => 'Arabic',
+  ),
+  13 =>
+  array (
+    0 => 0x0700,
+    1 => 0x074F,
+    2 => 'Syriac',
+  ),
+  14 =>
+  array (
+    0 => 0x0750,
+    1 => 0x077F,
+    2 => 'Arabic Supplement',
+  ),
+  15 =>
+  array (
+    0 => 0x0780,
+    1 => 0x07BF,
+    2 => 'Thaana',
+  ),
+  16 =>
+  array (
+    0 => 0x0900,
+    1 => 0x097F,
+    2 => 'Devanagari',
+  ),
+  17 =>
+  array (
+    0 => 0x0980,
+    1 => 0x09FF,
+    2 => 'Bengali',
+  ),
+  18 =>
+  array (
+    0 => 0x0A00,
+    1 => 0x0A7F,
+    2 => 'Gurmukhi',
+  ),
+  19 =>
+  array (
+    0 => 0x0A80,
+    1 => 0x0AFF,
+    2 => 'Gujarati',
+  ),
+  20 =>
+  array (
+    0 => 0x0B00,
+    1 => 0x0B7F,
+    2 => 'Oriya',
+  ),
+  21 =>
+  array (
+    0 => 0x0B80,
+    1 => 0x0BFF,
+    2 => 'Tamil',
+  ),
+  22 =>
+  array (
+    0 => 0x0C00,
+    1 => 0x0C7F,
+    2 => 'Telugu',
+  ),
+  23 =>
+  array (
+    0 => 0x0C80,
+    1 => 0x0CFF,
+    2 => 'Kannada',
+  ),
+  24 =>
+  array (
+    0 => 0x0D00,
+    1 => 0x0D7F,
+    2 => 'Malayalam',
+  ),
+  25 =>
+  array (
+    0 => 0x0D80,
+    1 => 0x0DFF,
+    2 => 'Sinhala',
+  ),
+  26 =>
+  array (
+    0 => 0x0E00,
+    1 => 0x0E7F,
+    2 => 'Thai',
+  ),
+  27 =>
+  array (
+    0 => 0x0E80,
+    1 => 0x0EFF,
+    2 => 'Lao',
+  ),
+  28 =>
+  array (
+    0 => 0x0F00,
+    1 => 0x0FFF,
+    2 => 'Tibetan',
+  ),
+  29 =>
+  array (
+    0 => 0x1000,
+    1 => 0x109F,
+    2 => 'Myanmar',
+  ),
+  30 =>
+  array (
+    0 => 0x10A0,
+    1 => 0x10FF,
+    2 => 'Georgian',
+  ),
+  31 =>
+  array (
+    0 => 0x1100,
+    1 => 0x11FF,
+    2 => 'Hangul Jamo',
+  ),
+  32 =>
+  array (
+    0 => 0x1200,
+    1 => 0x137F,
+    2 => 'Ethiopic',
+  ),
+  33 =>
+  array (
+    0 => 0x1380,
+    1 => 0x139F,
+    2 => 'Ethiopic Supplement',
+  ),
+  34 =>
+  array (
+    0 => 0x13A0,
+    1 => 0x13FF,
+    2 => 'Cherokee',
+  ),
+  35 =>
+  array (
+    0 => 0x1400,
+    1 => 0x167F,
+    2 => 'Unified Canadian Aboriginal Syllabics',
+  ),
+  36 =>
+  array (
+    0 => 0x1680,
+    1 => 0x169F,
+    2 => 'Ogham',
+  ),
+  37 =>
+  array (
+    0 => 0x16A0,
+    1 => 0x16FF,
+    2 => 'Runic',
+  ),
+  38 =>
+  array (
+    0 => 0x1700,
+    1 => 0x171F,
+    2 => 'Tagalog',
+  ),
+  39 =>
+  array (
+    0 => 0x1720,
+    1 => 0x173F,
+    2 => 'Hanunoo',
+  ),
+  40 =>
+  array (
+    0 => 0x1740,
+    1 => 0x175F,
+    2 => 'Buhid',
+  ),
+  41 =>
+  array (
+    0 => 0x1760,
+    1 => 0x177F,
+    2 => 'Tagbanwa',
+  ),
+  42 =>
+  array (
+    0 => 0x1780,
+    1 => 0x17FF,
+    2 => 'Khmer',
+  ),
+  43 =>
+  array (
+    0 => 0x1800,
+    1 => 0x18AF,
+    2 => 'Mongolian',
+  ),
+  44 =>
+  array (
+    0 => 0x1900,
+    1 => 0x194F,
+    2 => 'Limbu',
+  ),
+  45 =>
+  array (
+    0 => 0x1950,
+    1 => 0x197F,
+    2 => 'Tai Le',
+  ),
+  46 =>
+  array (
+    0 => 0x1980,
+    1 => 0x19DF,
+    2 => 'New Tai Lue',
+  ),
+  47 =>
+  array (
+    0 => 0x19E0,
+    1 => 0x19FF,
+    2 => 'Khmer Symbols',
+  ),
+  48 =>
+  array (
+    0 => 0x1A00,
+    1 => 0x1A1F,
+    2 => 'Buginese',
+  ),
+  49 =>
+  array (
+    0 => 0x1D00,
+    1 => 0x1D7F,
+    2 => 'Phonetic Extensions',
+  ),
+  50 =>
+  array (
+    0 => 0x1D80,
+    1 => 0x1DBF,
+    2 => 'Phonetic Extensions Supplement',
+  ),
+  51 =>
+  array (
+    0 => 0x1DC0,
+    1 => 0x1DFF,
+    2 => 'Combining Diacritical Marks Supplement',
+  ),
+  52 =>
+  array (
+    0 => 0x1E00,
+    1 => 0x1EFF,
+    2 => 'Latin Extended Additional',
+  ),
+  53 =>
+  array (
+    0 => 0x1F00,
+    1 => 0x1FFF,
+    2 => 'Greek Extended',
+  ),
+  54 =>
+  array (
+    0 => 0x2000,
+    1 => 0x206F,
+    2 => 'General Punctuation',
+  ),
+  55 =>
+  array (
+    0 => 0x2070,
+    1 => 0x209F,
+    2 => 'Superscripts and Subscripts',
+  ),
+  56 =>
+  array (
+    0 => 0x20A0,
+    1 => 0x20CF,
+    2 => 'Currency Symbols',
+  ),
+  57 =>
+  array (
+    0 => 0x20D0,
+    1 => 0x20FF,
+    2 => 'Combining Diacritical Marks for Symbols',
+  ),
+  58 =>
+  array (
+    0 => 0x2100,
+    1 => 0x214F,
+    2 => 'Letterlike Symbols',
+  ),
+  59 =>
+  array (
+    0 => 0x2150,
+    1 => 0x218F,
+    2 => 'Number Forms',
+  ),
+  60 =>
+  array (
+    0 => 0x2190,
+    1 => 0x21FF,
+    2 => 'Arrows',
+  ),
+  61 =>
+  array (
+    0 => 0x2200,
+    1 => 0x22FF,
+    2 => 'Mathematical Operators',
+  ),
+  62 =>
+  array (
+    0 => 0x2300,
+    1 => 0x23FF,
+    2 => 'Miscellaneous Technical',
+  ),
+  63 =>
+  array (
+    0 => 0x2400,
+    1 => 0x243F,
+    2 => 'Control Pictures',
+  ),
+  64 =>
+  array (
+    0 => 0x2440,
+    1 => 0x245F,
+    2 => 'Optical Character Recognition',
+  ),
+  65 =>
+  array (
+    0 => 0x2460,
+    1 => 0x24FF,
+    2 => 'Enclosed Alphanumerics',
+  ),
+  66 =>
+  array (
+    0 => 0x2500,
+    1 => 0x257F,
+    2 => 'Box Drawing',
+  ),
+  67 =>
+  array (
+    0 => 0x2580,
+    1 => 0x259F,
+    2 => 'Block Elements',
+  ),
+  68 =>
+  array (
+    0 => 0x25A0,
+    1 => 0x25FF,
+    2 => 'Geometric Shapes',
+  ),
+  69 =>
+  array (
+    0 => 0x2600,
+    1 => 0x26FF,
+    2 => 'Miscellaneous Symbols',
+  ),
+  70 =>
+  array (
+    0 => 0x2700,
+    1 => 0x27BF,
+    2 => 'Dingbats',
+  ),
+  71 =>
+  array (
+    0 => 0x27C0,
+    1 => 0x27EF,
+    2 => 'Miscellaneous Mathematical Symbols-A',
+  ),
+  72 =>
+  array (
+    0 => 0x27F0,
+    1 => 0x27FF,
+    2 => 'Supplemental Arrows-A',
+  ),
+  73 =>
+  array (
+    0 => 0x2800,
+    1 => 0x28FF,
+    2 => 'Braille Patterns',
+  ),
+  74 =>
+  array (
+    0 => 0x2900,
+    1 => 0x297F,
+    2 => 'Supplemental Arrows-B',
+  ),
+  75 =>
+  array (
+    0 => 0x2980,
+    1 => 0x29FF,
+    2 => 'Miscellaneous Mathematical Symbols-B',
+  ),
+  76 =>
+  array (
+    0 => 0x2A00,
+    1 => 0x2AFF,
+    2 => 'Supplemental Mathematical Operators',
+  ),
+  77 =>
+  array (
+    0 => 0x2B00,
+    1 => 0x2BFF,
+    2 => 'Miscellaneous Symbols and Arrows',
+  ),
+  78 =>
+  array (
+    0 => 0x2C00,
+    1 => 0x2C5F,
+    2 => 'Glagolitic',
+  ),
+  79 =>
+  array (
+    0 => 0x2C80,
+    1 => 0x2CFF,
+    2 => 'Coptic',
+  ),
+  80 =>
+  array (
+    0 => 0x2D00,
+    1 => 0x2D2F,
+    2 => 'Georgian Supplement',
+  ),
+  81 =>
+  array (
+    0 => 0x2D30,
+    1 => 0x2D7F,
+    2 => 'Tifinagh',
+  ),
+  82 =>
+  array (
+    0 => 0x2D80,
+    1 => 0x2DDF,
+    2 => 'Ethiopic Extended',
+  ),
+  83 =>
+  array (
+    0 => 0x2E00,
+    1 => 0x2E7F,
+    2 => 'Supplemental Punctuation',
+  ),
+  84 =>
+  array (
+    0 => 0x2E80,
+    1 => 0x2EFF,
+    2 => 'CJK Radicals Supplement',
+  ),
+  85 =>
+  array (
+    0 => 0x2F00,
+    1 => 0x2FDF,
+    2 => 'Kangxi Radicals',
+  ),
+  86 =>
+  array (
+    0 => 0x2FF0,
+    1 => 0x2FFF,
+    2 => 'Ideographic Description Characters',
+  ),
+  87 =>
+  array (
+    0 => 0x3000,
+    1 => 0x303F,
+    2 => 'CJK Symbols and Punctuation',
+  ),
+  88 =>
+  array (
+    0 => 0x3040,
+    1 => 0x309F,
+    2 => 'Hiragana',
+  ),
+  89 =>
+  array (
+    0 => 0x30A0,
+    1 => 0x30FF,
+    2 => 'Katakana',
+  ),
+  90 =>
+  array (
+    0 => 0x3100,
+    1 => 0x312F,
+    2 => 'Bopomofo',
+  ),
+  91 =>
+  array (
+    0 => 0x3130,
+    1 => 0x318F,
+    2 => 'Hangul Compatibility Jamo',
+  ),
+  92 =>
+  array (
+    0 => 0x3190,
+    1 => 0x319F,
+    2 => 'Kanbun',
+  ),
+  93 =>
+  array (
+    0 => 0x31A0,
+    1 => 0x31BF,
+    2 => 'Bopomofo Extended',
+  ),
+  94 =>
+  array (
+    0 => 0x31C0,
+    1 => 0x31EF,
+    2 => 'CJK Strokes',
+  ),
+  95 =>
+  array (
+    0 => 0x31F0,
+    1 => 0x31FF,
+    2 => 'Katakana Phonetic Extensions',
+  ),
+  96 =>
+  array (
+    0 => 0x3200,
+    1 => 0x32FF,
+    2 => 'Enclosed CJK Letters and Months',
+  ),
+  97 =>
+  array (
+    0 => 0x3300,
+    1 => 0x33FF,
+    2 => 'CJK Compatibility',
+  ),
+  98 =>
+  array (
+    0 => 0x3400,
+    1 => 0x4DBF,
+    2 => 'CJK Unified Ideographs Extension A',
+  ),
+  99 =>
+  array (
+    0 => 0x4DC0,
+    1 => 0x4DFF,
+    2 => 'Yijing Hexagram Symbols',
+  ),
+  100 =>
+  array (
+    0 => 0x4E00,
+    1 => 0x9FFF,
+    2 => 'CJK Unified Ideographs',
+  ),
+  101 =>
+  array (
+    0 => 0xA000,
+    1 => 0xA48F,
+    2 => 'Yi Syllables',
+  ),
+  102 =>
+  array (
+    0 => 0xA490,
+    1 => 0xA4CF,
+    2 => 'Yi Radicals',
+  ),
+  103 =>
+  array (
+    0 => 0xA700,
+    1 => 0xA71F,
+    2 => 'Modifier Tone Letters',
+  ),
+  104 =>
+  array (
+    0 => 0xA800,
+    1 => 0xA82F,
+    2 => 'Syloti Nagri',
+  ),
+  105 =>
+  array (
+    0 => 0xAC00,
+    1 => 0xD7AF,
+    2 => 'Hangul Syllables',
+  ),
+  106 =>
+  array (
+    0 => 0xD800,
+    1 => 0xDB7F,
+    2 => 'High Surrogates',
+  ),
+  107 =>
+  array (
+    0 => 0xDB80,
+    1 => 0xDBFF,
+    2 => 'High Private Use Surrogates',
+  ),
+  108 =>
+  array (
+    0 => 0xDC00,
+    1 => 0xDFFF,
+    2 => 'Low Surrogates',
+  ),
+  109 =>
+  array (
+    0 => 0xE000,
+    1 => 0xF8FF,
+    2 => 'Private Use Area',
+  ),
+  110 =>
+  array (
+    0 => 0xF900,
+    1 => 0xFAFF,
+    2 => 'CJK Compatibility Ideographs',
+  ),
+  111 =>
+  array (
+    0 => 0xFB00,
+    1 => 0xFB4F,
+    2 => 'Alphabetic Presentation Forms',
+  ),
+  112 =>
+  array (
+    0 => 0xFB50,
+    1 => 0xFDFF,
+    2 => 'Arabic Presentation Forms-A',
+  ),
+  113 =>
+  array (
+    0 => 0xFE00,
+    1 => 0xFE0F,
+    2 => 'Variation Selectors',
+  ),
+  114 =>
+  array (
+    0 => 0xFE10,
+    1 => 0xFE1F,
+    2 => 'Vertical Forms',
+  ),
+  115 =>
+  array (
+    0 => 0xFE20,
+    1 => 0xFE2F,
+    2 => 'Combining Half Marks',
+  ),
+  116 =>
+  array (
+    0 => 0xFE30,
+    1 => 0xFE4F,
+    2 => 'CJK Compatibility Forms',
+  ),
+  117 =>
+  array (
+    0 => 0xFE50,
+    1 => 0xFE6F,
+    2 => 'Small Form Variants',
+  ),
+  118 =>
+  array (
+    0 => 0xFE70,
+    1 => 0xFEFF,
+    2 => 'Arabic Presentation Forms-B',
+  ),
+  119 =>
+  array (
+    0 => 0xFF00,
+    1 => 0xFFEF,
+    2 => 'Halfwidth and Fullwidth Forms',
+  ),
+  120 =>
+  array (
+    0 => 0xFFF0,
+    1 => 0xFFFF,
+    2 => 'Specials',
+  ),
+  121 =>
+  array (
+    0 => 0x10000,
+    1 => 0x1007F,
+    2 => 'Linear B Syllabary',
+  ),
+  122 =>
+  array (
+    0 => 0x10080,
+    1 => 0x100FF,
+    2 => 'Linear B Ideograms',
+  ),
+  123 =>
+  array (
+    0 => 0x10100,
+    1 => 0x1013F,
+    2 => 'Aegean Numbers',
+  ),
+  124 =>
+  array (
+    0 => 0x10140,
+    1 => 0x1018F,
+    2 => 'Ancient Greek Numbers',
+  ),
+  125 =>
+  array (
+    0 => 0x10300,
+    1 => 0x1032F,
+    2 => 'Old Italic',
+  ),
+  126 =>
+  array (
+    0 => 0x10330,
+    1 => 0x1034F,
+    2 => 'Gothic',
+  ),
+  127 =>
+  array (
+    0 => 0x10380,
+    1 => 0x1039F,
+    2 => 'Ugaritic',
+  ),
+  128 =>
+  array (
+    0 => 0x103A0,
+    1 => 0x103DF,
+    2 => 'Old Persian',
+  ),
+  129 =>
+  array (
+    0 => 0x10400,
+    1 => 0x1044F,
+    2 => 'Deseret',
+  ),
+  130 =>
+  array (
+    0 => 0x10450,
+    1 => 0x1047F,
+    2 => 'Shavian',
+  ),
+  131 =>
+  array (
+    0 => 0x10480,
+    1 => 0x104AF,
+    2 => 'Osmanya',
+  ),
+  132 =>
+  array (
+    0 => 0x10800,
+    1 => 0x1083F,
+    2 => 'Cypriot Syllabary',
+  ),
+  133 =>
+  array (
+    0 => 0x10A00,
+    1 => 0x10A5F,
+    2 => 'Kharoshthi',
+  ),
+  134 =>
+  array (
+    0 => 0x1D000,
+    1 => 0x1D0FF,
+    2 => 'Byzantine Musical Symbols',
+  ),
+  135 =>
+  array (
+    0 => 0x1D100,
+    1 => 0x1D1FF,
+    2 => 'Musical Symbols',
+  ),
+  136 =>
+  array (
+    0 => 0x1D200,
+    1 => 0x1D24F,
+    2 => 'Ancient Greek Musical Notation',
+  ),
+  137 =>
+  array (
+    0 => 0x1D300,
+    1 => 0x1D35F,
+    2 => 'Tai Xuan Jing Symbols',
+  ),
+  138 =>
+  array (
+    0 => 0x1D400,
+    1 => 0x1D7FF,
+    2 => 'Mathematical Alphanumeric Symbols',
+  ),
+  139 =>
+  array (
+    0 => 0x20000,
+    1 => 0x2A6DF,
+    2 => 'CJK Unified Ideographs Extension B',
+  ),
+  140 =>
+  array (
+    0 => 0x2F800,
+    1 => 0x2FA1F,
+    2 => 'CJK Compatibility Ideographs Supplement',
+  ),
+  141 =>
+  array (
+    0 => 0xE0000,
+    1 => 0xE007F,
+    2 => 'Tags',
+  ),
+  142 =>
+  array (
+    0 => 0xE0100,
+    1 => 0xE01EF,
+    2 => 'Variation Selectors Supplement',
+  ),
+  143 =>
+  array (
+    0 => 0xF0000,
+    1 => 0xFFFFF,
+    2 => 'Supplementary Private Use Area-A',
+  ),
+  144 =>
+  array (
+    0 => 0x100000,
+    1 => 0x10FFFF,
+    2 => 'Supplementary Private Use Area-B',
+  ),
+);
+?>
diff --git a/library/langdet/docs/confidence.php b/library/langdet/docs/confidence.php

new file mode 100644 (file)

index 0000000..5be0fb9
--- /dev/null
+++ b/library/langdet/docs/confidence.php
@@ -0,0 +1,18 @@
+<?php
+require_once 'Text/LanguageDetect.php';
+
+$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
+
+$ld = new Text_LanguageDetect();
+//ask for the 3 most probable languages (second argument limits the result count)
+$results = $ld->detect($text, 3);
+
+foreach ($results as $language => $confidence) {
+    echo $language . ': ' . number_format($confidence, 2) . "\n";
+}
+
+//example output (language: confidence score, formatted with two decimals):
+//german: 0.35
+//dutch: 0.25
+//swedish: 0.20
+?>
+\ No newline at end of file
diff --git a/library/langdet/docs/errorhandling.php b/library/langdet/docs/errorhandling.php

new file mode 100644 (file)

index 0000000..b68e424
--- /dev/null
+++ b/library/langdet/docs/errorhandling.php
@@ -0,0 +1,15 @@
+<?php
+/**
+ * How to handle errors: catch Text_LanguageDetect_Exception
+ */
+require_once 'Text/LanguageDetect.php';
+require_once 'Text/LanguageDetect/Exception.php';
+
+try {
+    $ld = new Text_LanguageDetect();
+    $lang = $ld->detectSimple('Das ist ein kleiner Text');
+    echo "Language is: $lang\n";
+} catch (Text_LanguageDetect_Exception $e) {
+    echo 'An error occurred! Message: ' . $e->getMessage() . "\n";
+}
+?>
+\ No newline at end of file
diff --git a/library/langdet/docs/example_clui.php b/library/langdet/docs/example_clui.php

index 8e7d8577d377f1e922876b783cfb90c014647445..210b0eec4d8dbd131bf2716e432c971f3a90f45b 100644 (file)
--- a/library/langdet/docs/example_clui.php
+++ b/library/langdet/docs/example_clui.php
@@ -4,7 +4,7 @@
   * example usage (CLI)
   *
   * @package Text_LanguageDetect
- * @version CVS: $Id: example_clui.php 322305 2012-01-15 00:04:17Z clockwerx $
+ * @version CVS: $Id$
   */
  
  require_once 'Text/LanguageDetect.php';
diff --git a/library/langdet/docs/example_web.php b/library/langdet/docs/example_web.php

index 1e155fef2cbdabea3d2a4d786e7d35dfc4a2a48c..bee8f51a44e53e9a62db918375121afe9f1d14a8 100644 (file)
--- a/library/langdet/docs/example_web.php
+++ b/library/langdet/docs/example_web.php
@@ -4,7 +4,7 @@
   * example usage (web)
   *
   * @package Text_LanguageDetect
- * @version CVS: $Id: example_web.php 205493 2006-01-18 00:26:57Z taak $
+ * @version CVS: $Id$
   */
  
  // browsers will encode multi-byte characters wrong unless they think the page is utf8-encoded
diff --git a/library/langdet/docs/iso.php b/library/langdet/docs/iso.php

index 6d7ec1d2e6012d18b3ba14e04e9b148ab3677cc4..547316313ff8bd4f0c1c0909fbf2e6fe614239df 100644 (file)
--- a/library/langdet/docs/iso.php
+++ b/library/langdet/docs/iso.php
@@ -5,17 +5,15 @@
   * The "name mode" changes the way languages are accepted and returned.
   */ 
  require_once 'Text/LanguageDetect.php';
-$l = new Text_LanguageDetect();
-
+$ld = new Text_LanguageDetect();
  
  //will output the ISO 639-1 two-letter language code
  // "de"
-$l->setNameMode(2);
-echo $l->detectSimple('Das ist ein kleiner Text') . "\n";
+$ld->setNameMode(2);
+echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
  
  //will output the ISO 639-2 three-letter language code
  // "deu"
-$l->setNameMode(3);
-echo $l->detectSimple('Das ist ein kleiner Text') . "\n";
-
-?>
-\ No newline at end of file
+$ld->setNameMode(3);
+echo $ld->detectSimple('Das ist ein kleiner Text') . "\n";
+?>
diff --git a/library/langdet/docs/languages.php b/library/langdet/docs/languages.php

new file mode 100644 (file)

index 0000000..f6d022c
--- /dev/null
+++ b/library/langdet/docs/languages.php
@@ -0,0 +1,11 @@
+<?php
+/**
+ * List all supported languages, one per line
+ */
+require_once 'Text/LanguageDetect.php';
+$ld = new Text_LanguageDetect();
+
+foreach ($ld->getLanguages() as $lang) {
+    echo $lang . "\n";
+}
+?>
diff --git a/library/langdet/docs/simple.php b/library/langdet/docs/simple.php

new file mode 100644 (file)

index 0000000..0bfc11e
--- /dev/null
+++ b/library/langdet/docs/simple.php
@@ -0,0 +1,10 @@
+<?php
+require_once 'Text/LanguageDetect.php';
+
+$text = 'Was wäre, wenn ich Ihnen das jetzt sagen würde?';
+
+$ld = new Text_LanguageDetect();
+$result = $ld->detectSimple($text);
+var_dump($result);
+//output: string(6) "german"
+?>
diff --git a/library/langdet/tests/PrivProxy.php b/library/langdet/tests/PrivProxy.php

new file mode 100644 (file)

index 0000000..0f6af1d
--- /dev/null
+++ b/library/langdet/tests/PrivProxy.php
@@ -0,0 +1,95 @@
+<?php
+/**
+ * Helper that enables access to private and protected methods and properties.
+ *
+ * Wraps one object and forwards method calls and property reads/writes to it
+ * through reflection, after marking the target member accessible.
+ */
+class PrivProxy
+{
+    /**
+     * Object whose methods and properties are accessed through this proxy.
+     */
+    private $obj;
+
+    /**
+     * @param object $obj Object to wrap
+     */
+    public function __construct($obj)
+    {
+        $this->obj = $obj;
+    }
+
+    /**
+     * Invoke a method of the wrapped object regardless of its visibility.
+     *
+     * @param string $method    Name of the method to invoke
+     * @param array  $arguments Arguments handed to the method; when calling
+     *                          __call() directly, pass [&$var] for by-reference parameters
+     *
+     * @return mixed Whatever the invoked method returns
+     *
+     * @throws ReflectionException If the wrapped object has no such method
+     */
+    public function __call($method, $arguments)
+    {
+        $rm = new ReflectionMethod($this->obj, $method);
+        $rm->setAccessible(true);
+        return $rm->invokeArgs($this->obj, $arguments);
+    }
+
+    /**
+     * Reject static calls such as PrivProxy::foo().
+     *
+     * A static call has no proxy instance and therefore no wrapped object
+     * to forward to, so fail with a descriptive exception.
+     *
+     * @param string $method    Name of the method that was called statically
+     * @param array  $arguments Arguments of the call (unused)
+     *
+     * @return void
+     *
+     * @throws BadMethodCallException Always
+     */
+    public static function __callStatic($method, $arguments)
+    {
+        throw new BadMethodCallException(
+            'PrivProxy::' . $method . '() cannot be called statically;'
+            . ' call it on a PrivProxy instance instead'
+        );
+    }
+
+    /**
+     * Write a property of the wrapped object regardless of its visibility.
+     *
+     * @param string $var   Name of the property
+     * @param mixed  $value Value to assign
+     *
+     * @return void
+     *
+     * @throws ReflectionException If the wrapped object has no such property
+     */
+    public function __set($var, $value)
+    {
+        $rp = new ReflectionProperty($this->obj, $var);
+        $rp->setAccessible(true);
+        $rp->setValue($this->obj, $value);
+    }
+
+    /**
+     * Read a property of the wrapped object regardless of its visibility.
+     *
+     * @param string $var Name of the property
+     *
+     * @return mixed Current value of the property
+     *
+     * @throws ReflectionException If the wrapped object has no such property
+     */
+    public function __get($var)
+    {
+        $rp = new ReflectionProperty($this->obj, $var);
+        $rp->setAccessible(true);
+        return $rp->getValue($this->obj);
+    }
+}
+?>
diff --git a/library/langdet/tests/Text_LanguageDetectTest.php b/library/langdet/tests/Text_LanguageDetectTest.php

index bbf4dd77989f2a2e5555bdab7e435fa5139d811c..a5cf442201253e02def62f42232cb55d35697f1a 100644 (file)
--- a/library/langdet/tests/Text_LanguageDetectTest.php
+++ b/library/langdet/tests/Text_LanguageDetectTest.php
@@ -2,7 +2,7 @@
  
  /**
   * @package Text_LanguageDetect
- * @version CVS: $Id: Text_LanguageDetectTest.php 322353 2012-01-16 08:41:43Z cweiske $
+ * @version CVS: $Id$
   */
  set_include_path(
      __DIR__ . '/../' . PATH_SEPARATOR . get_include_path()
@@ -10,7 +10,7 @@ set_include_path(
  error_reporting(E_ALL|E_STRICT);
  
  require_once 'Text/LanguageDetect.php';
-require_once 'PHPUnit/Framework/TestCase.php';
+require_once __DIR__ . '/PrivProxy.php';
  
  class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
  
@@ -18,6 +18,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      {
          ini_set('magic_quotes_runtime', 0);
          $this->x = new Text_LanguageDetect();
+        $this->xproxy = new PrivProxy($this->x);
      }
  
      function tearDown ()
@@ -29,16 +30,16 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      {
          $this->assertEquals(
              '/path/to/file',
-            $this->x->_get_data_loc('/path/to/file')
+            $this->xproxy->_get_data_loc('/path/to/file')
          );
      }
  
      function test_get_data_locPearPath()
      {
-        $this->x->_data_dir = '/path/to/pear/data';
+        $this->xproxy->_data_dir = '/path/to/pear/data';
          $this->assertEquals(
              '/path/to/pear/data/Text_LanguageDetect/file',
-            $this->x->_get_data_loc('file')
+            $this->xproxy->_get_data_loc('file')
          );
      }
  
@@ -48,7 +49,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
       */
      function test_readdbNonexistingFile()
      {
-        $this->x->_readdb('thisfiledoesnotexist');
+        $this->xproxy->_readdb('thisfiledoesnotexist');
      }
  
      /**
@@ -59,7 +60,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      {
          $name = tempnam(sys_get_temp_dir(), 'unittest-Text_LanguageDetect-');
          chmod($name, 0000);
-        $this->x->_readdb($name);
+        $this->xproxy->_readdb($name);
      }
  
      /**
@@ -68,7 +69,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
       */
      function test_checkTrigramEmpty()
      {
-        $this->x->_checkTrigram(array());
+        $this->xproxy->_checkTrigram(array());
      }
  
      /**
@@ -77,7 +78,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
       */
      function test_checkTrigramNoArray()
      {
-        $this->x->_checkTrigram('foo');
+        $this->xproxy->_checkTrigram('foo');
      }
  
      /**
@@ -90,26 +91,26 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
              $this->markTestSkipped('5.4.0 has no magic quotes anymore');
          }
          ini_set('magic_quotes_runtime', 1);
-        $this->x->_checkTrigram('foo');
+        $this->xproxy->_checkTrigram('foo');
      }
  
      function test_splitter ()
      {
          $str = 'hello';
  
-        $result = $this->x->_trigram($str);
+        $result = $this->xproxy->_trigram($str);
  
          $this->assertEquals(array(' he' => 1, 'hel' => 1, 'ell' => 1, 'llo' => 1, 'lo ' => 1), $result);
  
          $str = 'aa aa whatever';
  
-        $result = $this->x->_trigram($str);
+        $result = $this->xproxy->_trigram($str);
          $this->assertEquals(2, $result[' aa']);
          $this->assertEquals(2, $result['aa ']);
          $this->assertEquals(1, $result['a a']);
  
          $str = 'aa  aa';
-        $result = $this->x->_trigram($str);
+        $result = $this->xproxy->_trigram($str);
          $this->assertArrayNotHasKey('  a', $result, '  a');
          $this->assertArrayNotHasKey('a  ', $result, 'a  ');
      }
@@ -118,7 +119,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      {
          $str = 'resumé';
   
-        $result = $this->x->_trigram($str);
+        $result = $this->xproxy->_trigram($str);
   
          $this->assertTrue(isset($result['mé ']), 'mé ');
          $this->assertTrue(isset($result['umé']), 'umé');
@@ -127,7 +128,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          // tests lower-casing accented characters
          $str = 'resumÉ';
          
-        $result = $this->x->_trigram($str);
+        $result = $this->xproxy->_trigram($str);
   
          $this->assertTrue(isset($result['mé ']),'mé ');
          $this->assertTrue(isset($result['umé']),'umé');
@@ -137,7 +138,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      function test_sort ()
      {
          $arr = array('a' => 1, 'b' => 2, 'c' => 2);
-        $this->x->_bub_sort($arr);
+        $this->xproxy->__call('_bub_sort',[&$arr]);
  
          $final_arr = array('b' => 2, 'c' => 2, 'a' => 1);
  
@@ -175,8 +176,9 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $str = 'This function may return Boolean FALSE, but may also return a non-Boolean value which evaluates to FALSE, such as 0 or "". Please read the section on Booleans for more information. Use the === operator for testing the return value of this function.';
  
          $myobj = new Text_LanguageDetect;
+        $myobjproxy = new PrivProxy($myobj);
  
-        $myobj->_use_unicode_narrowing = false;
+        $myobjproxy->_use_unicode_narrowing = false;
  
          $count = $myobj->getLanguageCount();
          $returnval = $myobj->omitLanguages('english');
@@ -228,23 +230,22 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      function testOmitLanguagesClearsClusterCache()
      {
          $this->x->omitLanguages(array('english', 'german'), true);
-        $this->assertNull($this->x->_clusters);
+        $this->assertNull($this->xproxy->_clusters);
          $this->x->clusterLanguages();
-        $this->assertNotNull($this->x->_clusters);
+        $this->assertNotNull($this->xproxy->_clusters);
          $this->x->omitLanguages('german');
-        $this->assertNull($this->x->_clusters, 'cluster cache be empty now');
+        $this->assertNull($this->xproxy->_clusters, 'cluster cache be empty now');
      }
  
      function test_perl_compatibility()
      {
          // if this test fails, then many of the others will
  
-        $myobj = new Text_LanguageDetect;
-        $myobj->setPerlCompatible(true);
+        $this->x->setPerlCompatible(true);
  
          $testtext = "hello";
  
-        $result = $myobj->_trigram($testtext);
+        $result = $this->xproxy->_trigram($testtext);
  
          $this->assertTrue(!isset($result[' he']));
      }
@@ -316,7 +317,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          );
  
  
-        $my_arr = $this->x->_lang_db['french'];
+        $my_arr = $this->xproxy->_lang_db['french'];
  
          foreach ($safe_model as $key => $value) {
              $this->assertTrue(isset($my_arr[$key]),$key);
@@ -392,7 +393,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
              "ess" => 295,     "ie " => 296,     "ist" => 297,     "lat" => 298,     "uri" => 299,
          );
  
-        $mod = $this->x->_lang_db['english'];
+        $mod = $this->xproxy->_lang_db['english'];
  
          foreach ($realdb as $key => $value) {
              $this->assertTrue(isset($mod[$key]), $key);
@@ -432,7 +433,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $str = 'The Italian Renaissance began the opening phase of the Renaissance, a period of great cultural change and achievement from the 14th to the 16th century. The word renaissance means "rebirth," and the era is best known for the renewed interest in the culture of classical antiquity. The Italian Renaissance began in northern Italy, centering in Florence. It then spread south, having an especially significant impact on Rome, which was largely rebuilt by the Renaissance popes. The Italian Renaissance is best known for its cultural achievements. This includes works of literature by such figures as Petrarch, Castiglione, and Machiavelli; artists such as Michaelangelo and Leonardo da Vinci, and great works of architecture such as The Duomo in Florence and St. Peter\'s Basilica in Rome. At the same time, present-day historians also see the era as one of economic regression and of little progress in science. Furthermore, some historians argue that the lot of the peasants and urban poor, the majority of the population, worsened during this period.';
      
          $this->x->setPerlCompatible();
-        $tri = $this->x->_trigram($str);
+        $tri = $this->xproxy->_trigram($str);
          
          $exp_tri = array(
              ' th',
@@ -956,7 +957,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          //print_r(array_diff(array_keys($tri), $exp_tri));
  
          // tests the bubble sort mechanism
-        $this->x->_bub_sort($tri);
+        $this->xproxy->__call('_bub_sort', [&$tri]);
          $this->assertEquals($exp_tri, array_keys($tri));
  
          $true_differences = array(
@@ -1112,16 +1113,16 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
              " wh" => array('change' => 195, 'baserank' => 232, 'refrank' => 37),      " ph" => array('change' => 300, 'baserank' => 220, 'refrank' => null),
          );
          
-        $ranked = $this->x->_arr_rank($this->x->_trigram($str));
+        $ranked = $this->xproxy->_arr_rank($this->xproxy->_trigram($str));
          $results = $this->x->detect($str);
  
          $count = count($ranked);
          $sum = 0;
  
-        //foreach ($this->x->_lang_db['english'] as $key => $value) {
+        //foreach ($this->xproxy->_lang_db['english'] as $key => $value) {
          foreach ($ranked as $key => $value) {
-            if (isset($ranked[$key]) && isset($this->x->_lang_db['english'][$key])) {
-                $difference = abs($this->x->_lang_db['english'][$key] - $ranked[$key]);
+            if (isset($ranked[$key]) && isset($this->xproxy->_lang_db['english'][$key])) {
+                $difference = abs($this->xproxy->_lang_db['english'][$key] - $ranked[$key]);
              } else {
                  $difference = 300;
              }
@@ -1148,11 +1149,11 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $this->x->setPerlCompatible();
          $str = "Verifions que le détecteur de langues marche";
  
-        $trigrams = $this->x->_trigram($str);
+        $trigrams = $this->xproxy->_trigram($str);
          $this->assertEquals(42, count($trigrams));
          // verified in Language::Guess
  
-        $ranked = $this->x->_arr_rank($trigrams);
+        $ranked = $this->xproxy->_arr_rank($trigrams);
          $this->assertEquals(0, $ranked['e l']);
  
          $correct_ranks = array(
@@ -1250,7 +1251,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
  
  
  
-        $french_ranks = $this->x->_lang_db['french'];
+        $french_ranks = $this->xproxy->_lang_db['french'];
  
          $sumchange = 0;
          foreach ($ranked as $key => $value) {
@@ -1273,7 +1274,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
              $sumchange += $difference;
          }
  
-        $actual_result = $this->x->_distance($french_ranks, $ranked);
+        $actual_result = $this->xproxy->_distance($french_ranks, $ranked);
          $this->assertEquals($sumchange, $actual_result);
          $this->assertEquals(7091, $actual_result);
          $this->assertEquals(168, floor($sumchange/count($trigrams)));
@@ -1288,8 +1289,8 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $str = 'авай проверить  узнает ли наш угадатель русски язык';
  
          $this->x->setPerlCompatible();
-        $trigrams = $this->x->_trigram($str);
-        $ranked = $this->x->_arr_rank($trigrams);
+        $trigrams = $this->xproxy->_trigram($str);
+        $ranked = $this->xproxy->_arr_rank($trigrams);
  
          $correct_ranks = array(
              ' ру' => array('change' => 300, 'baserank' => 3, 'refrank' => null),
@@ -1345,7 +1346,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $this->assertEquals(48, count($ranked));
  
  
-        $russian = $this->x->_lang_db['russian'];
+        $russian = $this->xproxy->_lang_db['russian'];
  
          $sumchange = 0;
          foreach ($ranked as $key => $value) {
@@ -1368,7 +1369,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
              $sumchange += $difference;
          }
  
-        $actual_result = $this->x->_distance($russian, $ranked);
+        $actual_result = $this->xproxy->_distance($russian, $ranked);
          $this->assertEquals($sumchange, $actual_result);
          $this->assertEquals(10428, $actual_result);
          $this->assertEquals(217, floor($sumchange/count($trigrams)));
@@ -1381,7 +1382,7 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
      {
          $str = 'is it s i';
  
-        $result = $this->x->_arr_rank($this->x->_trigram($str));
+        $result = $this->xproxy->_arr_rank($this->xproxy->_trigram($str));
  
          $this->assertEquals(0, $result['s i']);
      }
@@ -1621,9 +1622,11 @@ class Text_LanguageDetectTest extends PHPUnit_Framework_TestCase {
          $i = 0;
          $j = 0;
          $new_u = '';
+        $rm = new ReflectionMethod('Text_LanguageDetect', '_next_char');
+        $rm->setAccessible(true);
          while ($i < strlen($uppercased)) {
-            $u = Text_LanguageDetect::_next_char($uppercased, $i, true);
-            $l = Text_LanguageDetect::_next_char($lowercased, $j, true);
+            $u = $rm->invokeArgs($this->x, [$uppercased, &$i, true]);
+            $l = $rm->invokeArgs($this->x, [$lowercased, &$j, true]);
              $this->assertEquals($u, $l);
  
              $new_u .= $u;
@@ -1777,7 +1780,7 @@ EOF;
          
  
          foreach ($chars as $utf8 => $unicode) {
-            $this->assertEquals($unicode, $this->x->_utf8char2unicode($utf8), $utf8);
+            $this->assertEquals($unicode, $this->xproxy->_utf8char2unicode($utf8), $utf8);
          }
      }
  
@@ -1922,7 +1925,7 @@ EOF;
      {
          $this->assertEquals(
              'english',
-            $this->x->_convertFromNameMode('english')
+            $this->xproxy->_convertFromNameMode('english')
          );
      }
  
@@ -1931,7 +1934,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              'english',
-            $this->x->_convertFromNameMode('en')
+            $this->xproxy->_convertFromNameMode('en')
          );
      }
  
@@ -1940,7 +1943,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              'english',
-            $this->x->_convertFromNameMode('eng')
+            $this->xproxy->_convertFromNameMode('eng')
          );
      }
  
@@ -1949,7 +1952,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              array('english', 'german'),
-            $this->x->_convertFromNameMode(array('en', 'de'))
+            $this->xproxy->_convertFromNameMode(array('en', 'de'))
          );
      }
  
@@ -1958,7 +1961,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              array('english' => 'foo', 'german' => 'test'),
-            $this->x->_convertFromNameMode(
+            $this->xproxy->_convertFromNameMode(
                  array('en' => 'foo', 'de' => 'test'),
                  true
              )
@@ -1970,7 +1973,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              array('english', 'german'),
-            $this->x->_convertFromNameMode(array('eng', 'deu'))
+            $this->xproxy->_convertFromNameMode(array('eng', 'deu'))
          );
      }
  
@@ -1979,7 +1982,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              array('english' => 'foo', 'german' => 'test'),
-            $this->x->_convertFromNameMode(
+            $this->xproxy->_convertFromNameMode(
                  array('eng' => 'foo', 'deu' => 'test'),
                  true
              )
@@ -1990,7 +1993,7 @@ EOF;
      {
          $this->assertEquals(
              'english',
-            $this->x->_convertToNameMode('english')
+            $this->xproxy->_convertToNameMode('english')
          );
      }
  
@@ -1999,7 +2002,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              'en',
-            $this->x->_convertToNameMode('english')
+            $this->xproxy->_convertToNameMode('english')
          );
      }
  
@@ -2008,7 +2011,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              'eng',
-            $this->x->_convertToNameMode('english')
+            $this->xproxy->_convertToNameMode('english')
          );
      }
  
@@ -2017,7 +2020,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              array('en', 'de'),
-            $this->x->_convertToNameMode(array('english', 'german'))
+            $this->xproxy->_convertToNameMode(array('english', 'german'))
          );
      }
  
@@ -2026,7 +2029,7 @@ EOF;
          $this->x->setNameMode(2);
          $this->assertEquals(
              array('en' => 'foo', 'de' => 'test'),
-            $this->x->_convertToNameMode(
+            $this->xproxy->_convertToNameMode(
                  array('english' => 'foo', 'german' => 'test'),
                  true
              )
@@ -2038,7 +2041,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              array('eng', 'deu'),
-            $this->x->_convertToNameMode(array('english', 'german'))
+            $this->xproxy->_convertToNameMode(array('english', 'german'))
          );
      }
  
@@ -2047,7 +2050,7 @@ EOF;
          $this->x->setNameMode(3);
          $this->assertEquals(
              array('eng' => 'foo', 'deu' => 'test'),
-            $this->x->_convertToNameMode(
+            $this->xproxy->_convertToNameMode(
                  array('english' => 'foo', 'german' => 'test'),
                  true
              )
author	Tobias Diekershoff <tobias.diekershoff@gmx.net>
	Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)
committer	Tobias Diekershoff <tobias.diekershoff@gmx.net>
	Mon, 4 Sep 2017 07:52:14 +0000 (09:52 +0200)
library/langdet/README.rst	[new file with mode: 0644]	patch \| blob
library/langdet/Text/LanguageDetect.php		patch \| blob \| history
library/langdet/Text/LanguageDetect/Exception.php		patch \| blob \| history
library/langdet/Text/LanguageDetect/ISO639.php		patch \| blob \| history
library/langdet/Text/LanguageDetect/Parser.php		patch \| blob \| history
library/langdet/data/build-unicode_blocks.php	[new file with mode: 0644]	patch \| blob
library/langdet/data/unicode_blocks.dat		patch \| blob \| history
library/langdet/data/unicode_blocks.php	[new file with mode: 0644]	patch \| blob
library/langdet/docs/confidence.php	[new file with mode: 0644]	patch \| blob
library/langdet/docs/errorhandling.php	[new file with mode: 0644]	patch \| blob
library/langdet/docs/example_clui.php		patch \| blob \| history
library/langdet/docs/example_web.php		patch \| blob \| history
library/langdet/docs/iso.php		patch \| blob \| history
library/langdet/docs/languages.php	[new file with mode: 0644]	patch \| blob
library/langdet/docs/simple.php	[new file with mode: 0644]	patch \| blob
library/langdet/tests/PrivProxy.php	[new file with mode: 0644]	patch \| blob
library/langdet/tests/Text_LanguageDetectTest.php		patch \| blob \| history