3 # Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
5 # b8 - A Bayesian spam filter written in PHP 5
7 # This program is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU Lesser General Public License as published by
9 # the Free Software Foundation in version 2.1 of the License.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 # License for more details.
16 # You should have received a copy of the GNU Lesser General Public License
17 # along with this program; if not, write to the Free Software Foundation,
18 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
26 * @author Tobias Leupold
27 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
# Default configuration. The constructor overwrites individual entries
# with the values supplied by the caller.
33 public $config = array(
# Handed to the lexer as its 'allow_numbers' option.
36 'allow_numbers' => FALSE,
# Name fragment from which the degenerator class and file names are built.
38 'degenerator' => 'default',
# Lexer instance created by the constructor; validate() reports
# STARTUP_FAIL_LEXER while this is still NULL.
46 private $_lexer = NULL;
# Storage backend instance created by the constructor; validate() reports
# STARTUP_FAIL_DATABASE while this is still NULL.
47 private $_database = NULL;
# Result of the storage lookup made by classify(); read by
# _get_probability() through its 'tokens' and 'degenerates' entries.
48 private $_token_data = NULL;
# Action identifiers passed to _process_text() by learn() and unlearn().
52 const LEARN = 'learn';
53 const UNLEARN = 'unlearn';
# Error codes returned in place of a result when startup validation or
# the category check fails.
55 const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
56 const STARTUP_FAIL_LEXER = 'STARTUP_FAIL_LEXER';
57 const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';
# Builds the filter: copies the caller's configuration into $this->config,
# then loads and instantiates the storage backend and the lexer.
#
# @param array $config           Configuration overrides, keyed by option name
# @param mixed $database_config  Not referenced in the lines visible here;
#                                presumably forwarded to the storage backend
#                                constructor -- TODO confirm
#
# NOTE(review): an optional parameter ($config) placed before a required one
# ($database_config) is deprecated as of PHP 8.0, and its default can never
# actually be used by a caller.
66 function __construct($config = array(), $database_config)
69 # Validate config data
71 if(count($config) > 0) {
73 foreach ($config as $name=>$value) {
# Each option is cast to a fixed type before it is stored. The branches
# that choose between the casts are not visible in this excerpt.
80 $this->config[$name] = (float) $value;
86 $this->config[$name] = (int) $value;
90 $this->config[$name] = (bool) $value;
# A name is lower-cased and accepted only if the matching file
# lexer/lexer_<name>.php exists next to this file; otherwise 'default' is used.
94 $value = (string) strtolower($value);
95 $this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default';
99 $this->config[$name] = (string) $value;
108 # Setup the database backend
110 # Get the basic storage class used by all backends
111 if($this->load_class('b8_storage_base', dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
114 # Get the degenerator we need
# NOTE(review): the 'degenerator' and 'storage' values are used to build
# include paths below; no existence check comparable to the one for the
# lexer name is visible in this excerpt -- confirm.
115 if($this->load_class('b8_degenerator_' . $this->config['degenerator'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)
118 # Get the actual storage backend we need
119 if($this->load_class('b8_storage_' . $this->config['storage'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
# The backend class name is derived from the configured storage name and
# instantiated dynamically.
123 $class = 'b8_storage_' . $this->config['storage'];
124 $this->_database = new $class(
# The degenerator name and today's date formatted as 'ymd' are passed along.
126 $this->config['degenerator'], date('ymd')
129 # Setup the lexer class
131 if($this->load_class('b8_lexer_' . $this->config['lexer'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
134 $class = 'b8_lexer_' . $this->config['lexer'];
135 $this->_lexer = new $class(
# Token size limits and the number policy are taken from the configuration.
137 'min_size' => $this->config['min_size'],
138 'max_size' => $this->config['max_size'],
139 'allow_numbers' => $this->config['allow_numbers']
146 * Load a class file if a class has not been defined yet.
149 * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
# Makes sure $class_name is defined, including $class_file when it is not.
#
# @param string $class_name  Name of the class that must exist afterwards
# @param string $class_file  Path of the file expected to define that class
152 public function load_class($class_name, $class_file)
# Passing FALSE as second argument keeps class_exists() from triggering
# autoloading, so only classes that are already defined count.
155 if(class_exists($class_name, FALSE) === FALSE) {
# NOTE(review): require_once stops the script when the file cannot be
# included, so $included can only be FALSE if the included file itself
# returns FALSE.
157 $included = require_once $class_file;
# The file must both load and actually define the requested class. The
# statements executed on failure are not visible in this excerpt.
159 if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
169 * Validates the class has all it needs to work.
172 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
# Checks that a storage backend and a lexer are available, connecting the
# storage backend on demand.
175 public function validate()
# No storage backend object was created
178 if($this->_database === NULL)
179 return self::STARTUP_FAIL_DATABASE;
181 # Connect the database backend if we aren't connected yet
183 elseif($this->_database->connected === FALSE) {
185 $connection = $this->_database->connect();
# Anything other than TRUE from connect() counts as a failure; the handling
# itself is not visible in this excerpt.
187 if($connection !== TRUE)
# No lexer object was created
192 if($this->_lexer === NULL)
193 return self::STARTUP_FAIL_LEXER;
204 * @param string $text
205 * @return float The rating between 0 (ham) and 1 (spam)
# Rates a text: tokenizes it, looks up the stored data for its tokens, picks
# the most significant token ratings and combines them into one probability.
#
# $uid is handed unchanged to the storage backend calls get_internals() and
# get(); presumably it selects the data set to use -- TODO confirm.
208 public function classify($uid,$text)
211 # Validate the startup
213 $started_up = $this->validate();
# validate() yields TRUE on success and an error code otherwise
215 if($started_up !== TRUE)
218 # Get the internal database variables, containing the number of ham and
219 # spam texts so the spam probability can be calculated in relation to them
220 $internals = $this->_database->get_internals($uid);
222 # Calculate the spamminess of all tokens
224 # Get all tokens we want to rate
226 $tokens = $this->_lexer->get_tokens($text);
228 # Check if the lexer failed
229 # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
230 if(!is_array($tokens))
233 # Fetch all available data for the token set from the database
234 $this->_token_data = $this->_database->get(array_keys($tokens),$uid);
236 # Calculate the spamminess and importance for each token (or a degenerated form of it)
238 $word_count = array();
240 $importance = array();
# $tokens maps each token to the number of times it occurs in the text
242 foreach($tokens as $word => $count) {
244 $word_count[$word] = $count;
246 # Although we call this function only here ... let's do the
247 # calculation stuff in a function to make this a bit less confusing ;-)
248 $rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
# Importance is the distance of the rating from the neutral value 0.5
250 $importance[$word] = abs(0.5 - $rating[$word]);
254 # Order by importance
258 # Get the most interesting tokens (use all if we have less than the given number)
262 for($i = 0; $i < $this->config['use_relevant']; $i++) {
# NOTE(review): each() was deprecated in PHP 7.2 and removed in PHP 8.0;
# this loop needs a different way of walking $importance on current PHP.
264 if($tmp = each($importance)) {
266 # Important tokens remain
268 # If the token's rating is relevant enough, use it
270 if(abs(0.5 - $rating[$tmp['key']]) > $this->config['min_dev']) {
272 # Tokens that appear more than once also count more than once
274 for($x = 0, $l = $word_count[$tmp['key']]; $x < $l; $x++)
275 array_push($relevant, $rating[$tmp['key']]);
282 # We have fewer tokens than requested, so we already
283 # use what we have and can break here
289 # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
290 # We set both hamminess and Spamminess to 1 for the first multiplying
294 # Consider all relevant ratings
295 foreach($relevant as $value) {
296 $hamminess *= (1.0 - $value);
297 $spamminess *= $value;
300 # If no token was good for calculation, we really don't know how
301 # to rate this text; so we assume a spam and ham probability of 0.5
# NOTE(review): the strict comparison with the integer 1 only holds while
# both values are still integers, i.e. they were initialised with integer 1
# (initialisation not visible here) and the loop above never ran; any
# multiplication by a float turns them into floats.
303 if($hamminess === 1 and $spamminess === 1) {
309 # Get the number of relevant ratings
310 $n = count($relevant);
313 # Calculate the combined rating
315 # The actual hamminess and spamminess
# 1 minus the n-th root of each product
316 $hamminess = 1 - pow($hamminess, (1 / $n));
317 $spamminess = 1 - pow($spamminess, (1 / $n));
319 # Calculate the combined indicator
320 $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);
322 # We want a value between 0 and 1, not between -1 and +1, so ...
323 $probability = (1 + $probability) / 2;
331 * Calculate the spamminess of a single token also considering "degenerated" versions
334 * @param string $word
335 * @param string $texts_ham
336 * @param string $texts_spam
# Rates a single token using the data fetched by classify(): the token's own
# entry if there is one, otherwise the most significant of its degenerated
# forms, otherwise the configured rating for unknown tokens.
340 private function _get_probability($word, $texts_ham, $texts_spam)
343 # Let's see what we have!
345 if(isset($this->_token_data['tokens'][$word]) === TRUE) {
346 # The token was in the database, so we can use its data as-is
347 # and calculate the spamminess of this token directly
348 return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
351 # Damn. The token was not found, so do we have at least similar words?
353 if(isset($this->_token_data['degenerates'][$word]) === TRUE) {
355 # We found similar words, so calculate the spamminess for each one
356 # and choose the most important one for the further calculation
358 # The default rating is 0.5 simply saying nothing
# Each entry maps a degenerated form of the token to its stored data, which
# is passed to _calc_probability() in the same way as a regular token entry.
361 foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {
363 # Calculate the rating of the current degenerated token
364 $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);
366 # Is it more important than the rating of another degenerated version?
# "More important" means further away from the neutral value 0.5
367 if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
368 $rating = $rating_tmp;
377 # The token is really unknown, so choose the default rating
378 # for completely unknown tokens. This strips down to the
379 # robX parameter so we can cheap out the freaky math ;-)
380 return $this->config['rob_x'];
386 * Do the actual spamminess calculation of a single token
390 * @param string $texts_ham
391 * @param string $texts_spam
# Computes the rating of one token from its stored ham and spam counts and
# the number of learned ham and spam texts.
#
# $data is read through its 'count_ham' and 'count_spam' entries.
395 private function _calc_probability($data, $texts_ham, $texts_spam)
398 # Calculate the basic probability by Mr. Graham
400 # But: consider the number of ham and spam texts saved instead of the
401 # number of entries where the token appeared to calculate a relative
402 # spamminess because we count tokens appearing multiple times not just
403 # once but as often as they appear in the learned texts
# Start from the raw counts ...
405 $rel_ham = $data['count_ham'];
406 $rel_spam = $data['count_spam'];
# ... and replace them by counts relative to the number of learned texts.
# The conditions guarding these two divisions are not visible in this
# excerpt; presumably they skip the division when the text count is 0 --
# TODO confirm.
409 $rel_ham = $data['count_ham'] / $texts_ham;
412 $rel_spam = $data['count_spam'] / $texts_spam;
# NOTE(review): the divisor is 0 if both relative counts are 0; confirm that
# the storage backend never returns such an entry.
414 $rating = $rel_spam / ($rel_ham + $rel_spam);
416 # Calculate the better probability proposed by Mr. Robinson
417 $all = $data['count_ham'] + $data['count_spam'];
# Weighted mean of the assumed rating rob_x (weight rob_s) and the rating
# computed above (weight: total number of occurrences of the token).
418 return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);
423 * Check the validity of the category of a request
426 * @param string $category
private function _check_category($category)
{
    # A category is valid only if it is identical (value and type) to one
    # of the two category constants of this class.
    $valid_categories = array(self::HAM, self::SPAM);

    return in_array($category, $valid_categories, TRUE);
}
436 * Learn a reference text
439 * @param string $text
440 * @param const $category Either b8::SPAM or b8::HAM
public function learn($text, $category, $uid)
{
    # Learning is the generic text processing with the LEARN action;
    # whatever _process_text() yields is handed back unchanged.
    $result = $this->_process_text($text, $category, self::LEARN, $uid);

    return $result;
}
450 * Unlearn a reference text
453 * @param string $text
454 * @param const $category Either b8::SPAM or b8::HAM
public function unlearn($text, $category, $uid)
{
    # Unlearning is the generic text processing with the UNLEARN action;
    # whatever _process_text() yields is handed back unchanged.
    $result = $this->_process_text($text, $category, self::UNLEARN, $uid);

    return $result;
}
464 * Does the actual interaction with the storage backend for learning or unlearning texts
467 * @param string $text
468 * @param const $category Either b8::SPAM or b8::HAM
469 * @param const $action Either b8::LEARN or b8::UNLEARN
# Shared implementation behind learn() and unlearn(): validates the startup
# state and the category, tokenizes the text and hands the tokens to the
# storage backend.
#
# $uid is passed through unchanged to the storage backend's process_text().
473 private function _process_text($text, $category, $action, $uid = 0)
476 # Validate the startup
478 $started_up = $this->validate();
# validate() yields TRUE on success and an error code otherwise; the
# handling of the failure case is not visible in this excerpt.
480 if($started_up !== TRUE)
483 # Look if the request is okay
484 if($this->_check_category($category) === FALSE)
485 return self::TRAINER_CATEGORY_FAIL;
487 # Get all tokens from $text
489 $tokens = $this->_lexer->get_tokens($text);
491 # Check if the lexer failed
492 # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
493 if(!is_array($tokens))
496 # Pass the tokens and what to do with it to the storage backend
497 return $this->_database->process_text($tokens, $category, $action, $uid);