library/spam/b8/b8.php

   1 <?php
   2
   3 #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
   4 #
   5 #   b8 - A Bayesian spam filter written in PHP 5
   6 #
   7 #   This program is free software; you can redistribute it and/or modify it
   8 #   under the terms of the GNU Lesser General Public License as published by
   9 #   the Free Software Foundation in version 2.1 of the License.
  10 #
  11 #   This program is distributed in the hope that it will be useful, but
  12 #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  14 #   License for more details.
  15 #
  16 #   You should have received a copy of the GNU Lesser General Public License
  17 #   along with this program; if not, write to the Free Software Foundation,
  18 #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  19
  20 /**
  21  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
  22  *
  23  * @license LGPL
  24  * @access public
  25  * @package b8
  26  * @author Tobias Leupold
  27  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
  28  */
  29
  30 class b8
  31 {
  32
  33         public $config = array(
  34                 'min_size'      => 3,
  35                 'max_size'      => 30,
  36                 'allow_numbers' => FALSE,
  37                 'lexer'         => 'default',
  38                 'degenerator'   => 'default',
  39                 'storage'       => 'dba',
  40                 'use_relevant'  => 15,
  41                 'min_dev'       => 0.2,
  42                 'rob_s'         => 0.3,
  43                 'rob_x'         => 0.5
  44         );
  45
  46         private $_lexer      = NULL;
  47         private $_database   = NULL;
  48         private $_token_data = NULL;
  49
  50         const SPAM    = 'spam';
  51         const HAM     = 'ham';
  52         const LEARN   = 'learn';
  53         const UNLEARN = 'unlearn';
  54
  55         const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
  56         const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
  57         const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';
  58
  59         /**
  60          * Constructs b8
  61          *
  62          * @access public
  63          * @return void
  64          */
  65
  66         function __construct($config = array(), $database_config)
  67         {
  68
  69                 # Validate config data
  70
  71                 if(count($config) > 0) {
  72
  73                         foreach ($config as $name=>$value) {
  74
  75                                 switch($name) {
  76
  77                                         case 'min_dev':
  78                                         case 'rob_s':
  79                                         case 'rob_x':
  80                                                 $this->config[$name] = (float) $value;
  81                                                 break;
  82
  83                                         case 'min_size':
  84                                         case 'max_size':
  85                                         case 'use_relevant':
  86                                                 $this->config[$name] = (int) $value;
  87                                                 break;
  88
  89                                         case 'allow_numbers':
  90                                                 $this->config[$name] = (bool) $value;
  91                                                 break;
  92
  93                                         case 'lexer':
  94                                                 $value = (string) strtolower($value);
  95                                                 $this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default';
  96                                                 break;
  97
  98                                         case 'storage':
  99                                                 $this->config[$name] = (string) $value;
 100                                                 break;
 101
 102                                 }
 103
 104                         }
 105
 106                 }
 107
 108                 # Setup the database backend
 109
 110                 # Get the basic storage class used by all backends
 111                 if($this->load_class('b8_storage_base', dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
 112                         return;
 113
 114                 # Get the degenerator we need
 115                 if($this->load_class('b8_degenerator_' . $this->config['degenerator'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)
 116                         return;
 117
 118                 # Get the actual storage backend we need
 119                 if($this->load_class('b8_storage_' . $this->config['storage'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
 120                         return;
 121
 122                 # Setup the backend
 123                 $class = 'b8_storage_' . $this->config['storage'];
 124                 $this->_database = new $class(
 125                         $database_config,
 126                         $this->config['degenerator'], date('ymd')
 127                 );
 128
 129                 # Setup the lexer class
 130
 131                 if($this->load_class('b8_lexer_' . $this->config['lexer'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
 132                         return;
 133
 134                 $class = 'b8_lexer_' . $this->config['lexer'];
 135                 $this->_lexer = new $class(
 136                         array(
 137                                 'min_size' => $this->config['min_size'],
 138                                 'max_size' => $this->config['max_size'],
 139                                 'allow_numbers' => $this->config['allow_numbers']
 140                         )
 141                 );
 142
 143         }
 144
 145         /**
 146          * Load a class file if a class has not been defined yet.
 147          *
 148          * @access public
 149          * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
 150          */
 151
 152         public function load_class($class_name, $class_file)
 153         {
 154
 155                 if(class_exists($class_name, FALSE) === FALSE) {
 156
 157                         $included = require_once $class_file;
 158
 159                         if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
 160                                 return FALSE;
 161
 162                 }
 163
 164                 return TRUE;
 165
 166         }
 167
 168         /**
 169          * Validates the class has all it needs to work.
 170          *
 171          * @access public
 172          * @return mixed Returns TRUE if everything is okay, otherwise an error code.
 173          */
 174
 175         public function validate()
 176         {
 177
 178                 if($this->_database === NULL)
 179                         return self::STARTUP_FAIL_DATABASE;
 180
 181                 # Connect the database backend if we aren't connected yet
 182
 183                 elseif($this->_database->connected === FALSE) {
 184
 185                         $connection = $this->_database->connect();
 186
 187                         if($connection !== TRUE)
 188                                 return $connection;
 189
 190                 }
 191
 192                 if($this->_lexer === NULL)
 193                         return self::STARTUP_FAIL_LEXER;
 194
 195                 return TRUE;
 196
 197         }
 198
 199         /**
 200          * Classifies a text
 201          *
 202          * @access public
 203          * @package default
 204          * @param string $text
 205          * @return float The rating between 0 (ham) and 1 (spam)
 206          */
 207
 208         public function classify($uid,$text)
 209         {
 210
 211                 # Validate the startup
 212
 213                 $started_up = $this->validate();
 214
 215                 if($started_up !== TRUE)
 216                         return $started_up;
 217
 218                 # Get the internal database variables, containing the number of ham and
 219                 # spam texts so the spam probability can be calculated in relation to them
 220                 $internals = $this->_database->get_internals($uid);
 221
 222                 # Calculate the spamminess of all tokens
 223
 224                 # Get all tokens we want to rate
 225
 226                 $tokens = $this->_lexer->get_tokens($text);
 227
 228                 # Check if the lexer failed
 229                 # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
 230                 if(!is_array($tokens))
 231                         return $tokens;
 232
 233                 # Fetch all availible data for the token set from the database
 234                 $this->_token_data = $this->_database->get(array_keys($tokens),$uid);
 235
 236                 # Calculate the spamminess and importance for each token (or a degenerated form of it)
 237
 238                 $word_count = array();
 239                 $rating     = array();
 240                 $importance = array();
 241
 242                 foreach($tokens as $word => $count) {
 243
 244                         $word_count[$word] = $count;
 245
 246                         # Although we only call this function only here ... let's do the
 247                         # calculation stuff in a function to make this a bit less confusing ;-)
 248                         $rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
 249
 250                         $importance[$word] = abs(0.5 - $rating[$word]);
 251
 252                 }
 253
 254                 # Order by importance
 255                 arsort($importance);
 256                 reset($importance);
 257
 258                 # Get the most interesting tokens (use all if we have less than the given number)
 259
 260                 $relevant = array();
 261
 262                 for($i = 0; $i < $this->config['use_relevant']; $i++) {
 263
 264                         if($tmp = each($importance)) {
 265
 266                                 # Important tokens remain
 267
 268                                 # If the token's rating is relevant enough, use it
 269
 270                                 if(abs(0.5 - $rating[$tmp['key']]) > $this->config['min_dev']) {
 271
 272                                         # Tokens that appear more than once also count more than once
 273
 274                                         for($x = 0, $l = $word_count[$tmp['key']]; $x < $l; $x++)
 275                                                 array_push($relevant, $rating[$tmp['key']]);
 276
 277                                 }
 278
 279                         }
 280
 281                         else {
 282                                 # We have less than words to use, so we already
 283                                 # use what we have and can break here
 284                                 break;
 285                         }
 286
 287                 }
 288
 289                 # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
 290                 # We set both hamminess and Spamminess to 1 for the first multiplying
 291                 $hamminess  = 1;
 292                 $spamminess = 1;
 293
 294                 # Consider all relevant ratings
 295                 foreach($relevant as $value) {
 296                         $hamminess  *= (1.0 - $value);
 297                         $spamminess *= $value;
 298                 }
 299
 300                 # If no token was good for calculation, we really don't know how
 301                 # to rate this text; so we assume a spam and ham probability of 0.5
 302
 303                 if($hamminess === 1 and $spamminess === 1) {
 304                         $hamminess = 0.5;
 305                         $spamminess = 0.5;
 306                         $n = 1;
 307                 }
 308                 else {
 309                         # Get the number of relevant ratings
 310                         $n = count($relevant);
 311                 }
 312
 313                 # Calculate the combined rating
 314
 315                 # The actual hamminess and spamminess
 316                 $hamminess  = 1 - pow($hamminess,  (1 / $n));
 317                 $spamminess = 1 - pow($spamminess, (1 / $n));
 318
 319                 # Calculate the combined indicator
 320                 $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);
 321
 322                 # We want a value between 0 and 1, not between -1 and +1, so ...
 323                 $probability = (1 + $probability) / 2;
 324
 325                 # Alea iacta est
 326                 return $probability;
 327
 328         }
 329
 330         /**
 331          * Calculate the spamminess of a single token also considering "degenerated" versions
 332          *
 333          * @access private
 334          * @param string $word
 335          * @param string $texts_ham
 336          * @param string $texts_spam
 337          * @return void
 338          */
 339
 340         private function _get_probability($word, $texts_ham, $texts_spam)
 341         {
 342
 343                 # Let's see what we have!
 344
 345                 if(isset($this->_token_data['tokens'][$word]) === TRUE) {
 346                         # The token was in the database, so we can use it's data as-is
 347                         # and calculate the spamminess of this token directly
 348                         return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
 349                 }
 350
 351                 # Damn. The token was not found, so do we have at least similar words?
 352
 353                 if(isset($this->_token_data['degenerates'][$word]) === TRUE) {
 354
 355                         # We found similar words, so calculate the spamminess for each one
 356                         # and choose the most important one for the further calculation
 357
 358                         # The default rating is 0.5 simply saying nothing
 359                         $rating = 0.5;
 360
 361                         foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {
 362
 363                                 # Calculate the rating of the current degenerated token
 364                                 $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);
 365
 366                                 # Is it more important than the rating of another degenerated version?
 367                                 if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
 368                                         $rating = $rating_tmp;
 369
 370                         }
 371
 372                         return $rating;
 373
 374                 }
 375
 376                 else {
 377                         # The token is really unknown, so choose the default rating
 378                         # for completely unknown tokens. This strips down to the
 379                         # robX parameter so we can cheap out the freaky math ;-)
 380                         return $this->config['rob_x'];
 381                 }
 382
 383         }
 384
 385         /**
 386          * Do the actual spamminess calculation of a single token
 387          *
 388          * @access private
 389          * @param array $data
 390          * @param string $texts_ham
 391          * @param string $texts_spam
 392          * @return void
 393          */
 394
 395         private function _calc_probability($data, $texts_ham, $texts_spam)
 396         {
 397
 398                 # Calculate the basic probability by Mr. Graham
 399
 400                 # But: consider the number of ham and spam texts saved instead of the
 401                 # number of entries where the token appeared to calculate a relative
 402                 # spamminess because we count tokens appearing multiple times not just
 403                 # once but as often as they appear in the learned texts
 404
 405                 $rel_ham = $data['count_ham'];
 406                 $rel_spam = $data['count_spam'];
 407
 408                 if($texts_ham > 0)
 409                         $rel_ham = $data['count_ham'] / $texts_ham;
 410
 411                 if($texts_spam > 0)
 412                         $rel_spam = $data['count_spam'] / $texts_spam;
 413
 414                 $rating = $rel_spam / ($rel_ham + $rel_spam);
 415
 416                 # Calculate the better probability proposed by Mr. Robinson
 417                 $all = $data['count_ham'] + $data['count_spam'];
 418                 return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);
 419
 420         }
 421
 422         /**
 423          * Check the validity of the category of a request
 424          *
 425          * @access private
 426          * @param string $category
 427          * @return void
 428          */
 429
 430         private function _check_category($category)
 431         {
 432                 return $category === self::HAM or $category === self::SPAM;
 433         }
 434
 435         /**
 436          * Learn a reference text
 437          *
 438          * @access public
 439          * @param string $text
 440          * @param const $category Either b8::SPAM or b8::HAM
 441          * @return void
 442          */
 443
 444         public function learn($text, $category, $uid)
 445         {
 446                 return $this->_process_text($text, $category, self::LEARN, $uid);
 447         }
 448
 449         /**
 450          * Unlearn a reference text
 451          *
 452          * @access public
 453          * @param string $text
 454          * @param const $category Either b8::SPAM or b8::HAM
 455          * @return void
 456          */
 457
 458         public function unlearn($text, $category, $uid)
 459         {
 460                 return $this->_process_text($text, $category, self::UNLEARN, $uid);
 461         }
 462
 463         /**
 464          * Does the actual interaction with the storage backend for learning or unlearning texts
 465          *
 466          * @access private
 467          * @param string $text
 468          * @param const $category Either b8::SPAM or b8::HAM
 469          * @param const $action Either b8::LEARN or b8::UNLEARN
 470          * @return void
 471          */
 472
 473         private function _process_text($text, $category, $action, $uid = 0)
 474         {
 475
 476                 # Validate the startup
 477
 478                 $started_up = $this->validate();
 479
 480                 if($started_up !== TRUE)
 481                         return $started_up;
 482
 483                 # Look if the request is okay
 484                 if($this->_check_category($category) === FALSE)
 485                         return self::TRAINER_CATEGORY_FAIL;
 486
 487                 # Get all tokens from $text
 488
 489                 $tokens = $this->_lexer->get_tokens($text);
 490
 491                 # Check if the lexer failed
 492                 # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
 493                 if(!is_array($tokens))
 494                         return $tokens;
 495
 496                 # Pass the tokens and what to do with it to the storage backend
 497                 return $this->_database->process_text($tokens, $category, $action, $uid);
 498
 499         }
 500
 501 }
 502
 503 ?>