3 # Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
5 # This file is part of the b8 package
7 # This program is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU Lesser General Public License as published by
9 # the Free Software Foundation in version 2.1 of the License.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 # License for more details.
16 # You should have received a copy of the GNU Lesser General Public License
17 # along with this program; if not, write to the Free Software Foundation,
18 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
26 * @author Tobias Leupold
27 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
30 class b8_lexer_default
33 const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
34 const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY';
36 public $config = NULL;
38 # The regular expressions we use to split the text to tokens
40 public $regexp = array(
41 'ip' => '/([A-Za-z0-9\_\-\.]+)/',
42 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
43 'html' => '/(<.+?>)/',
44 'tagname' => '/(.+?)\s/',
45 'numbers' => '/^[0-9]+$/'
49 * Constructs the lexer.
55 function __construct($config)
57 $this->config = $config;
61 * Generates the tokens required for the bayesian filter.
65 * @return array|string Returns the list of tokens, or one of the LEXER_TEXT_* error constants if the text is not a string or is empty
68 public function get_tokens($text)
71 # Check that we actually have a string ...
72 if(is_string($text) === FALSE)
73 return self::LEXER_TEXT_NOT_STRING;
75 # ... and that it's not empty
76 if(empty($text) === TRUE)
77 return self::LEXER_TEXT_EMPTY;
79 # Decode HTML entities back to the original UTF-8 characters, as the
80 # text was encoded to HTML entities during the post process
81 $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
85 # Find URLs and IP addresses
87 preg_match_all($this->regexp['ip'], $text, $raw_tokens);
89 foreach($raw_tokens[1] as $word) {
92 if(strpos($word, '.') === FALSE)
95 # Check that the word is valid, min and max sizes, etc.
96 if($this->_is_valid($word) === FALSE)
99 if(isset($tokens[$word]) === FALSE)
104 # Delete the word from the text so it doesn't get re-added.
105 $text = str_replace($word, '', $text);
107 # Also process the parts of the URLs
108 $url_parts = preg_split($this->regexp['raw_split'], $word);
110 foreach($url_parts as $word) {
112 # Again validate the part
114 if($this->_is_valid($word) === FALSE)
117 if(isset($tokens[$word]) === FALSE)
126 # Split the remaining text
128 $raw_tokens = preg_split($this->regexp['raw_split'], $text);
130 foreach($raw_tokens as $word) {
132 # Again validate the part
134 if($this->_is_valid($word) === FALSE)
137 if(isset($tokens[$word]) === FALSE)
146 preg_match_all($this->regexp['html'], $text, $raw_tokens);
148 foreach($raw_tokens[1] as $word) {
150 # Again validate the part
152 if($this->_is_valid($word) === FALSE)
155 # If the tag has parameters, just use the tag itself
157 if(strpos($word, ' ') !== FALSE) {
158 preg_match($this->regexp['tagname'], $word, $tmp);
159 $word = "{$tmp[1]}...>";
162 if(isset($tokens[$word]) === FALSE)
169 # Return a list of all found tokens
178 * @param string $token The token string.
179 * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
182 private function _is_valid($token)
185 # Validate the size of the token
187 $len = strlen($token);
189 if($len < $this->config['min_size'] or $len > $this->config['max_size'])
192 # We may want to exclude pure numbers
193 if($this->config['allow_numbers'] === FALSE) {
194 if(preg_match($this->regexp['numbers'], $token) > 0)