3 # Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
5 # This file is part of the b8 package
7 # This program is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU Lesser General Public License as published by
9 # the Free Software Foundation in version 2.1 of the License.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 # License for more details.
16 # You should have received a copy of the GNU Lesser General Public License
17 # along with this program; if not, write to the Free Software Foundation,
18 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
26 * @author Tobias Leupold
class b8_degenerator_default
{

	/**
	 * Cache of the degenerates computed so far, keyed by the original word.
	 * Filled by _degenerate_word() so repeated words are only processed once.
	 *
	 * @var array
	 */
	public $degenerates = array();

	/**
	 * Generates a list of "degenerated" words for a list of words.
	 *
	 * @param array $words The words (tokens) to build degenerates for
	 * @return array An array containing an array of degenerated tokens for each token,
	 *               keyed by the original token
	 */
	public function degenerate(array $words)
	{
		$degenerates = array();

		foreach($words as $word) {
			$degenerates[$word] = $this->_degenerate_word($word);
		}

		# Hand the collected map back to the caller
		return $degenerates;
	}

	/**
	 * If the original word is not found in the database then
	 * we build "degenerated" versions of the word to lookup.
	 *
	 * The degenerates are the lower case, upper case and ucfirst versions of
	 * the word, plus, for each of these, versions with trailing "!"/"?" runs
	 * collapsed or removed and with trailing dots stripped one by one.
	 * Versions identical to the original word are left out. The list may
	 * contain duplicates.
	 *
	 * @param string $word
	 * @return array An array of degenerated words
	 */
	protected function _degenerate_word($word)
	{
		# Check for any stored words so the process doesn't have to repeat
		if(isset($this->degenerates[$word]) === TRUE) {
			return $this->degenerates[$word];
		}

		# Different versions of upper and lower case and ucfirst
		$case_variants = array(
			strtolower($word),
			strtoupper($word),
			ucfirst($word)
		);

		# The result list starts with the case versions; only these (and not
		# the tokens derived from them below) are degenerated further
		$degenerate = $case_variants;

		foreach($case_variants as $alt_word) {

			# Look for stuff like !!! and ???
			if(preg_match('/[!?]$/', $alt_word) > 0) {

				# Add a version with the run of !s and ?s reduced to its last character
				if(preg_match('/[!?]{2,}$/', $alt_word) > 0) {
					$degenerate[] = preg_replace('/([!?])+$/', '$1', $alt_word);
				}

				# Add a version without any trailing !s and ?s
				$degenerate[] = preg_replace('/([!?])+$/', '', $alt_word);

			}

			# Look for ... at the end of the word and strip the dots one by one
			$alt_word_int = $alt_word;

			while(preg_match('/[\.]$/', $alt_word_int) > 0) {
				$alt_word_int = substr($alt_word_int, 0, strlen($alt_word_int) - 1);
				$degenerate[] = $alt_word_int;
			}

		}

		# Some degenerates are the same as the original word. These don't have
		# to be fetched, so we create a new array with only new tokens.
		# The comparison has to be strict: a loose one treats numeric strings
		# like "1." and "1" or "1e3" and "1E3" as equal and would drop them.
		$original = (string) $word;
		$real_degenerate = array();

		foreach($degenerate as $deg_word) {
			if($deg_word !== $original) {
				$real_degenerate[] = $deg_word;
			}
		}

		# Store the list of degenerates for the token
		$this->degenerates[$word] = $real_degenerate;

		return $real_degenerate;
	}

}