updated pear LangDetect to version 1.0.0

[friendica.git] / library / langdet / Text / LanguageDetect.php
diff --git a/library/langdet/Text/LanguageDetect.php b/library/langdet/Text/LanguageDetect.php

index d3caaff39023107a6e9951449190c02bc8159fd7..7f7d58a96b5d410cc444bafb9d0d9c9e09bc4234 100644 (file)
--- a/library/langdet/Text/LanguageDetect.php
+++ b/library/langdet/Text/LanguageDetect.php
@@ -1,13 +1,6 @@
  <?php
-
  /**
- * Detects the language of a given piece of text.
- *
- * Attempts to detect the language of a sample of text by correlating ranked
- * 3-gram frequencies to a table of 3-gram frequencies of known languages.
- *
- * Implements a version of a technique originally proposed by Cavnar & Trenkle
- * (1994): "N-Gram-Based Text Categorization"
+ * Part of Text_LanguageDetect
   *
   * PHP version 5
   *
@@ -15,20 +8,24 @@
   * @package   Text_LanguageDetect
   * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
   * @copyright 2005-2006 Nicholas Pisarro
- * @license   http://www.debian.org/misc/bsd.license BSD
- * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
   * @link      http://pear.php.net/package/Text_LanguageDetect/
- * @link      http://langdetect.blogspot.com/
   */
  
-require_once 'library/langdet/Text/LanguageDetect/Exception.php';
-require_once 'library/langdet/Text/LanguageDetect/Parser.php';
-require_once 'library/langdet/Text/LanguageDetect/ISO639.php';
+require_once 'Text/LanguageDetect/Exception.php';
+require_once 'Text/LanguageDetect/Parser.php';
+require_once 'Text/LanguageDetect/ISO639.php';
  
  /**
- * Language detection class
+ * Detects the language of a given piece of text.
+ *
+ * Attempts to detect the language of a sample of text by correlating ranked
+ * 3-gram frequencies to a table of 3-gram frequencies of known languages.
+ *
+ * Implements a version of a technique originally proposed by Cavnar & Trenkle
+ * (1994): "N-Gram-Based Text Categorization"
   *
- * Requires the langauge model database (lang.dat) that should have
+ * Requires the language model database (lang.dat) that should have
   * accompanied this class definition in order to be instantiated.
   *
   * Example usage:
@@ -60,10 +57,9 @@ require_once 'library/langdet/Text/LanguageDetect/ISO639.php';
   * @package   Text_LanguageDetect
   * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
   * @copyright 2005 Nicholas Pisarro
- * @license   http://www.debian.org/misc/bsd.license BSD
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
   * @version   Release: @package_version@
   * @link      http://pear.php.net/package/Text_LanguageDetect/
- * @todo      allow users to generate their own language models
   */
  class Text_LanguageDetect
  {
@@ -73,10 +69,9 @@ class Text_LanguageDetect
       * If this value starts with a slash (/) or a dot (.) the value of
       * $this->_data_dir will be ignored
       *
-     * @var      string
-     * @access   private
+     * @var string
       */
-    var $_db_filename = 'lang.dat';
+    protected $_db_filename = 'lang.dat';
  
      /**
       * The filename that stores the unicode block definitions
@@ -85,83 +80,74 @@ class Text_LanguageDetect
       * $this->_data_dir will be ignored
       *
       * @var string
-     * @access private
       */
-    var $_unicode_db_filename = 'unicode_blocks.dat';
+    protected $_unicode_db_filename = 'unicode_blocks.dat';
  
      /**
       * The data directory
       *
       * Should be set by PEAR installer
       *
-     * @var      string
-     * @access   private
+     * @var string
       */
-    var $_data_dir = '@data_dir@';
+    protected $_data_dir = '@data_dir@';
  
      /**
       * The trigram data for comparison
       *
       * Will be loaded on start from $this->_db_filename
       *
-     * @var      array
-     * @access   private
+     * @var array
       */
-    var $_lang_db = array();
+    protected $_lang_db = array();
  
      /**
-     * stores the map of the trigram data to unicode characters
+     * Stores the map of the trigram data to unicode characters
       *
-     * @access private
       * @var array
       */
-    var $_unicode_map;
+    protected $_unicode_map;
  
      /**
       * The size of the trigram data arrays
       *
-     * @var      int
-     * @access   private
+     * @var int
       */
-    var $_threshold = 300;
+    protected $_threshold = 300;
  
      /**
-     * the maximum possible score.
+     * The maximum possible score.
       *
-     * needed for score normalization. Different depending on the
+     * Needed for score normalization. Different depending on the
       * perl compatibility setting
       *
-     * @access  private
-     * @var     int
-     * @see     setPerlCompatible()
+     * @var int
+     * @see setPerlCompatible()
       */
-    var $_max_score = 0;
+    protected $_max_score = 0;
  
      /**
       * Whether or not to simulate perl's Language::Guess exactly
       *
-     * @access  private
-     * @var     bool
-     * @see     setPerlCompatible()
+     * @var bool
+     * @see setPerlCompatible()
       */
-    var $_perl_compatible = false;
+    protected $_perl_compatible = false;
  
      /**
       * Whether to use the unicode block detection to speed up processing
       *
-     * @access private
       * @var bool
       */
-    var $_use_unicode_narrowing = true;
+    protected $_use_unicode_narrowing = true;
  
      /**
-     * stores the result of the clustering operation
+     * Stores the result of the clustering operation
       *
-     * @access  private
-     * @var     array
-     * @see     clusterLanguages()
+     * @var array
+     * @see clusterLanguages()
       */
-    var $_clusters;
+    protected $_clusters;
  
      /**
       * Which type of "language names" are accepted and returned:
@@ -170,7 +156,7 @@ class Text_LanguageDetect
       * 2 - 2-letter ISO 639-1 code ("en")
       * 3 - 3-letter ISO 639-2 code ("eng")
       */
-    var $_name_mode = 0;
+    protected $_name_mode = 0;
  
      /**
       * Constructor
@@ -178,7 +164,7 @@ class Text_LanguageDetect
       * Will attempt to load the language database. If it fails, you will get
       * an exception.
       */
-    function __construct()
+    public function __construct()
      {
          $data = $this->_readdb($this->_db_filename);
          $this->_checkTrigram($data['trigram']);
@@ -200,9 +186,8 @@ class Text_LanguageDetect
       * @param string $fname File name to load
       *
       * @return string expected path to the language model database
-     * @access private
       */
-    function _get_data_loc($fname)
+    protected function _get_data_loc($fname)
      {
          if ($fname{0} == '/' || $fname{0} == '.') {
              // if filename starts with a slash, assume it's an absolute pathname
@@ -229,9 +214,8 @@ class Text_LanguageDetect
       *
       * @return array the language model data
       * @throws Text_LanguageDetect_Exception
-     * @access private
       */
-    function _readdb($fname)
+    protected function _readdb($fname)
      {
          // finds the correct data dir
          $fname = $this->_get_data_loc($fname);
@@ -259,9 +243,8 @@ class Text_LanguageDetect
       * @param array $trigram Trigram data from database
       *
       * @return void
-     * @access private
       */
-    function _checkTrigram($trigram)
+    protected function _checkTrigram($trigram)
      {
          if (!is_array($trigram)) {
              if (ini_get('magic_quotes_runtime')) {
@@ -353,11 +336,10 @@ class Text_LanguageDetect
      /**
       * Returns the number of languages that this object can detect
       *
-     * @access public
       * @return int            the number of languages
-     * @throws   Text_LanguageDetect_Exception
+     * @throws Text_LanguageDetect_Exception
       */
-    function getLanguageCount()
+    public function getLanguageCount()
      {
          return count($this->_lang_db);
      }
@@ -395,11 +377,10 @@ class Text_LanguageDetect
      /**
       * Returns the list of detectable languages
       *
-     * @access public
       * @return array        the names of the languages known to this object
-     * @throws   Text_LanguageDetect_Exception
+     * @throws Text_LanguageDetect_Exception
       */
-    function getLanguages()
+    public function getLanguages()
      {
          return $this->_convertToNameMode(
              array_keys($this->_lang_db)
@@ -437,7 +418,7 @@ class Text_LanguageDetect
       *
       * @return void
       */
-    function setNameMode($name_mode)
+    public function setNameMode($name_mode)
      {
          $this->_name_mode = $name_mode;
      }
@@ -467,10 +448,9 @@ class Text_LanguageDetect
       * @param string $text text to convert
       *
       * @return     array array of trigram frequencies
-     * @access     private
       * @deprecated Superseded by the Text_LanguageDetect_Parser class
       */
-    function _trigram($text)
+    protected function _trigram($text)
      {
          $s = new Text_LanguageDetect_Parser($text);
          $s->prepareTrigram();
@@ -488,9 +468,8 @@ class Text_LanguageDetect
       * @param array $arr array of trigram
       *
       * @return array ranks of trigrams
-     * @access protected
       */
-    function _arr_rank($arr)
+    protected function _arr_rank($arr)
      {
  
          // sorts alphabetically first as a standard way of breaking rank ties
@@ -518,12 +497,11 @@ class Text_LanguageDetect
      /**
       * Sorts an array by value breaking ties alphabetically
       *
-     * @param array &$arr the array to sort
+     * @param array $arr the array to sort
       *
       * @return void
-     * @access private
       */
-    function _bub_sort(&$arr)
+    protected function _bub_sort(&$arr)
      {
          // should do the same as this perl statement:
          // sort { $trigrams{$b} == $trigrams{$a}
@@ -561,9 +539,8 @@ class Text_LanguageDetect
       *
       * @return int 1 if $a is greater, -1 if not
       * @see    _bub_sort()
-     * @access private
       */
-    function _sort_func($a, $b)
+    protected function _sort_func($a, $b)
      {
          // each is actually a key/value pair, so that it can compare using both
          list($a_key, $a_value) = $a;
@@ -601,9 +578,8 @@ class Text_LanguageDetect
       *
       * @return int the sum of the differences between the ranks of
       *             the two trigram sets
-     * @access private
       */
-    function _distance($arr1, $arr2)
+    protected function _distance($arr1, $arr2)
      {
          $sumdist = 0;
  
@@ -634,9 +610,8 @@ class Text_LanguageDetect
       *
       * @return float the normalized score
       * @see    _distance()
-     * @access private
       */
-    function _normalize_score($score, $base_count = null)
+    protected function _normalize_score($score, $base_count = null)
      {
          if ($base_count === null) {
              $base_count = $this->_threshold;
@@ -712,7 +687,7 @@ class Text_LanguageDetect
          $sample_obj->setPadStart(!$this->_perl_compatible);
          $sample_obj->analyze();
  
-        $trigram_freqs =& $sample_obj->getTrigramRanks();
+        $trigram_freqs = $sample_obj->getTrigramRanks();
          $trigram_count = count($trigram_freqs);
  
          if ($trigram_count == 0) {
@@ -723,7 +698,7 @@ class Text_LanguageDetect
  
          // use unicode block detection to narrow down the possibilities
          if ($this->_use_unicode_narrowing) {
-            $blocks =& $sample_obj->getUnicodeBlocks();
+            $blocks = $sample_obj->getUnicodeBlocks();
  
              if (is_array($blocks)) {
                  $present_blocks = array_keys($blocks);
@@ -975,9 +950,8 @@ class Text_LanguageDetect
       *
       * @return mixed Block name, -1 if it failed
       * @see    unicodeBlockName()
-     * @access protected
       */
-    function _unicode_block_name($unicode, $blocks, $block_count = -1)
+    protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
      {
          // for a reference, see
          // http://www.unicode.org/Public/UNIDATA/Blocks.txt
@@ -1028,9 +1002,8 @@ class Text_LanguageDetect
       *
       * @return array the database of unicode block definitions
       * @throws Text_LanguageDetect_Exception
-     * @access protected
       */
-    function _read_unicode_block_db()
+    protected function _read_unicode_block_db()
      {
          // since the unicode definitions are always going to be the same,
          // might as well share the memory for the db with all other instances
@@ -1149,14 +1122,13 @@ class Text_LanguageDetect
       * Uses a nearest neighbor technique to generate the maximum possible
       * number of dendrograms from the similarity data.
       *
-     * @access      public
-     * @return      array language cluster data
-     * @throws      Text_LanguageDetect_Exception
-     * @see         languageSimilarity()
-     * @deprecated  this function will eventually be removed and placed into
+     * @return     array language cluster data
+     * @throws     Text_LanguageDetect_Exception
+     * @see        languageSimilarity()
+     * @deprecated this function will eventually be removed and placed into
       *              the model generation class
       */
-    function clusterLanguages()
+    public function clusterLanguages()
      {
          // todo: set the maximum number of clusters
          // return cached result, if any
@@ -1465,7 +1437,7 @@ class Text_LanguageDetect
      }
  
      /**
-     * ut8-safe strlen()
+     * UTF8-safe strlen()
       *
       * Returns the numbers of characters (not bytes) in a utf8 string
       *
@@ -1489,10 +1461,9 @@ class Text_LanguageDetect
       * @param string $char a utf8 (possibly multi-byte) char
       *
       * @return int unicode value
-     * @access protected
       * @link   http://en.wikipedia.org/wiki/UTF-8
       */
-    function _utf8char2unicode($char)
+    protected function _utf8char2unicode($char)
      {
          // strlen() here will actually get the binary length of a single char
          switch (strlen($char)) {
@@ -1529,20 +1500,19 @@ class Text_LanguageDetect
      }
  
      /**
-     * utf8-safe fast character iterator
+     * UTF8-safe fast character iterator
       *
       * Will get the next character starting from $counter, which will then be
       * incremented. If a multi-byte char the bytes will be concatenated and
       * $counter will be incremented by the number of bytes in the char.
       *
       * @param string $str             the string being iterated over
-     * @param int    &$counter        the iterator, will increment by reference
+     * @param int    $counter         the iterator, will increment by reference
       * @param bool   $special_convert whether to do special conversions
       *
       * @return char the next (possibly multi-byte) char from $counter
-     * @access private
       */
-    static function _next_char($str, &$counter, $special_convert = false)
+    protected static function _next_char($str, &$counter, $special_convert = false)
      {
          $char = $str{$counter++};
          $ord = ord($char);
@@ -1634,7 +1604,7 @@ class Text_LanguageDetect
       *
       * @return string|array Language name
       */
-    function _convertFromNameMode($lang, $convertKey = false)
+    protected function _convertFromNameMode($lang, $convertKey = false)
      {
          if ($this->_name_mode == 0) {
              return $lang;
@@ -1674,7 +1644,7 @@ class Text_LanguageDetect
       *
       * @return string|array Language name
       */
-    function _convertToNameMode($lang, $convertKey = false)
+    protected function _convertToNameMode($lang, $convertKey = false)
      {
          if ($this->_name_mode == 0) {
              return $lang;