]> git.mxchange.org Git - friendica.git/blob - library/spam/b8/b8.php
DE update to the strings
[friendica.git] / library / spam / b8 / b8.php
1 <?php
2
3 #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
4 #
5 #   b8 - A Bayesian spam filter written in PHP 5
6 #
7 #   This program is free software; you can redistribute it and/or modify it
8 #   under the terms of the GNU Lesser General Public License as published by
9 #   the Free Software Foundation in version 2.1 of the License.
10 #
11 #   This program is distributed in the hope that it will be useful, but
12 #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14 #   License for more details.
15 #
16 #   You should have received a copy of the GNU Lesser General Public License
17 #   along with this program; if not, write to the Free Software Foundation,
18 #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19
20 /**
21  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
22  *
23  * @license LGPL
24  * @access public
25  * @package b8
26  * @author Tobias Leupold
27  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
28  */
29
class b8
{

    # Default settings. The constructor overrides them with validated values
    # from its $config argument (every key except 'degenerator' is accepted).
    #   min_size, max_size, allow_numbers: handed to the lexer
    #   lexer, degenerator, storage: select the class files to load
    #   use_relevant: maximum number of most important tokens rated per text
    #   min_dev: minimum deviation from 0.5 a token rating needs to be used
    #   rob_s, rob_x: parameters of Robinson's probability formula
    public $config = array(
        'min_size'      => 3,
        'max_size'      => 30,
        'allow_numbers' => FALSE,
        'lexer'         => 'default',
        'degenerator'   => 'default',
        'storage'       => 'dba',
        'use_relevant'  => 15,
        'min_dev'       => 0.2,
        'rob_s'         => 0.3,
        'rob_x'         => 0.5
    );

    private $_lexer      = NULL;
    private $_database   = NULL;
    private $_token_data = NULL;

    const SPAM    = 'spam';
    const HAM     = 'ham';
    const LEARN   = 'learn';
    const UNLEARN = 'unlearn';

    const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
    const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
    const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

    /**
     * Constructs b8
     *
     * Merges the validated settings into $this->config, then loads and
     * instantiates the storage backend and the lexer. If one of the needed
     * class files cannot be loaded, construction stops early and the
     * corresponding member stays NULL, which validate() reports later.
     *
     * @access public
     * @param array $config Settings overriding the defaults in $this->config
     * @param mixed $database_config Passed through unchanged to the storage backend constructor
     */

    public function __construct($config = array(), $database_config = array())
    {

        # Validate config data

        if(is_array($config) or $config instanceof Traversable) {

            foreach($config as $name => $value) {

                # Integer keys would loosely match the string case labels
                # below on PHP < 8 (0 == 'min_dev'), so only accept names
                if(is_string($name) === FALSE)
                    continue;

                switch($name) {

                    case 'min_dev':
                    case 'rob_s':
                    case 'rob_x':
                        $this->config[$name] = (float) $value;
                        break;

                    case 'min_size':
                    case 'max_size':
                    case 'use_relevant':
                        $this->config[$name] = (int) $value;
                        break;

                    case 'allow_numbers':
                        $this->config[$name] = (bool) $value;
                        break;

                    case 'lexer':
                        # Fall back to the default lexer if the requested one has no class file
                        $value = strtolower((string) $value);
                        $this->config[$name] = is_file($this->_component_file('lexer', $value)) === TRUE ? $value : 'default';
                        break;

                    case 'storage':
                        $this->config[$name] = (string) $value;
                        break;

                }

            }

        }

        # Setup the database backend

        # Get the basic storage class used by all backends
        if($this->load_class('b8_storage_base', $this->_component_file('storage', 'base')) === FALSE)
            return;

        # Get the degenerator we need
        if($this->load_class('b8_degenerator_' . $this->config['degenerator'], $this->_component_file('degenerator', $this->config['degenerator'])) === FALSE)
            return;

        # Get the actual storage backend we need
        if($this->load_class('b8_storage_' . $this->config['storage'], $this->_component_file('storage', $this->config['storage'])) === FALSE)
            return;

        # Setup the backend
        $class = 'b8_storage_' . $this->config['storage'];
        $this->_database = new $class(
            $database_config,
            $this->config['degenerator'], date('ymd')
        );

        # Setup the lexer class

        if($this->load_class('b8_lexer_' . $this->config['lexer'], $this->_component_file('lexer', $this->config['lexer'])) === FALSE)
            return;

        $class = 'b8_lexer_' . $this->config['lexer'];
        $this->_lexer = new $class(
            array(
                'min_size' => $this->config['min_size'],
                'max_size' => $this->config['max_size'],
                'allow_numbers' => $this->config['allow_numbers']
            )
        );

    }

    /**
     * Builds the path of a component class file located next to this file,
     * following the layout "<directory>/<directory>_<name>.php".
     *
     * @access private
     * @param string $directory Component directory ('storage', 'degenerator' or 'lexer')
     * @param string $name Component name, e.g. 'default' or 'base'
     * @return string The path of the class file
     */

    private function _component_file($directory, $name)
    {
        return dirname(__FILE__) . DIRECTORY_SEPARATOR . $directory . DIRECTORY_SEPARATOR . $directory . '_' . $name . '.php';
    }

    /**
     * Load a class file if a class has not been defined yet.
     *
     * @access public
     * @param string $class_name The class that has to exist afterwards
     * @param string $class_file The file expected to define that class
     * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
     */

    public function load_class($class_name, $class_file)
    {

        if(class_exists($class_name, FALSE) === TRUE)
            return TRUE;

        # A missing file must lead to the documented FALSE and not to the
        # fatal error require_once would raise, so check before including
        if(is_file($class_file) === FALSE or is_readable($class_file) === FALSE)
            return FALSE;

        $included = include_once $class_file;

        if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
            return FALSE;

        return TRUE;

    }

    /**
     * Validates the class has all it needs to work.
     *
     * @access public
     * @return mixed Returns TRUE if everything is okay, otherwise an error code
     *               (a STARTUP_FAIL_* constant or whatever the backend's connect() returned).
     */

    public function validate()
    {

        if($this->_database === NULL)
            return self::STARTUP_FAIL_DATABASE;

        # Connect the database backend if we aren't connected yet

        if($this->_database->connected === FALSE) {

            $connection = $this->_database->connect();

            if($connection !== TRUE)
                return $connection;

        }

        if($this->_lexer === NULL)
            return self::STARTUP_FAIL_LEXER;

        return TRUE;

    }

    /**
     * Classifies a text
     *
     * @access public
     * @param mixed $uid Passed to the storage backend together with every lookup
     * @param string $text
     * @return mixed The rating between 0 (ham) and 1 (spam) as a float, or an
     *               error code if the startup validation or the lexer failed
     */

    public function classify($uid,$text)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Get the internal database variables, containing the number of ham and
        # spam texts so the spam probability can be calculated in relation to them
        $internals = $this->_database->get_internals($uid);

        # Get all tokens we want to rate

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Fetch all available data for the token set from the database
        $this->_token_data = $this->_database->get(array_keys($tokens),$uid);

        # Calculate the spamminess and importance for each token (or a degenerated form of it)

        $rating     = array();
        $importance = array();

        foreach($tokens as $word => $count) {
            $rating[$word]     = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
            $importance[$word] = abs(0.5 - $rating[$word]);
        }

        # Order by importance
        arsort($importance);

        # Get the most interesting tokens (use all if we have less than the given number).
        # each() was removed in PHP 8.0, so the sorted array is walked with a
        # foreach that stops after use_relevant entries.

        $relevant  = array();
        $remaining = $this->config['use_relevant'];

        foreach($importance as $word => $deviation) {

            if($remaining <= 0)
                break;

            $remaining--;

            # If the token's rating is relevant enough, use it
            if($deviation > $this->config['min_dev']) {

                # Tokens that appear more than once also count more than once
                for($x = 0, $l = $tokens[$word]; $x < $l; $x++)
                    $relevant[] = $rating[$word];

            }

        }

        # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)

        $n = count($relevant);

        if($n === 0) {
            # If no token was good for calculation, we really don't know how
            # to rate this text; so we assume a spam and ham probability of 0.5
            $hamminess  = 0.5;
            $spamminess = 0.5;
            $n = 1;
        }
        else {

            # We set both hamminess and spamminess to 1 for the first multiplying
            $hamminess  = 1;
            $spamminess = 1;

            # Consider all relevant ratings
            foreach($relevant as $value) {
                $hamminess  *= (1.0 - $value);
                $spamminess *= $value;
            }

        }

        # Calculate the combined rating

        # The actual hamminess and spamminess
        $hamminess  = 1 - pow($hamminess,  (1 / $n));
        $spamminess = 1 - pow($spamminess, (1 / $n));

        # Calculate the combined indicator
        $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

        # We want a value between 0 and 1, not between -1 and +1, so ...
        $probability = (1 + $probability) / 2;

        # Alea iacta est
        return $probability;

    }

    /**
     * Calculate the spamminess of a single token also considering "degenerated" versions
     *
     * Reads the data fetched by classify() from $this->_token_data.
     *
     * @access private
     * @param string $word
     * @param int $texts_ham Number of ham texts, as reported by the storage backend
     * @param int $texts_spam Number of spam texts, as reported by the storage backend
     * @return float The rating of the token
     */

    private function _get_probability($word, $texts_ham, $texts_spam)
    {

        # Let's see what we have!

        if(isset($this->_token_data['tokens'][$word]) === TRUE) {
            # The token was in the database, so we can use its data as-is
            # and calculate the spamminess of this token directly
            return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
        }

        # The token was not found, so do we have at least similar words?

        if(isset($this->_token_data['degenerates'][$word]) === FALSE) {
            # The token is really unknown, so choose the default rating
            # for completely unknown tokens. This strips down to the
            # robX parameter so we can cheap out the freaky math ;-)
            return $this->config['rob_x'];
        }

        # We found similar words, so calculate the spamminess for each one
        # and choose the most important one for the further calculation

        # The default rating is 0.5 simply saying nothing
        $rating = 0.5;

        foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {

            # Calculate the rating of the current degenerated token
            $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

            # Is it more important than the rating of another degenerated version?
            if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
                $rating = $rating_tmp;

        }

        return $rating;

    }

    /**
     * Do the actual spamminess calculation of a single token
     *
     * @access private
     * @param array $data Token data with the keys 'count_ham' and 'count_spam'
     * @param int $texts_ham Number of ham texts
     * @param int $texts_spam Number of spam texts
     * @return float The rating of the token
     */

    private function _calc_probability($data, $texts_ham, $texts_spam)
    {

        # Calculate the basic probability by Mr. Graham

        # But: consider the number of ham and spam texts saved instead of the
        # number of entries where the token appeared to calculate a relative
        # spamminess because we count tokens appearing multiple times not just
        # once but as often as they appear in the learned texts

        $rel_ham = $data['count_ham'];
        $rel_spam = $data['count_spam'];

        if($texts_ham > 0)
            $rel_ham = $data['count_ham'] / $texts_ham;

        if($texts_spam > 0)
            $rel_spam = $data['count_spam'] / $texts_spam;

        $rel_all = $rel_ham + $rel_spam;

        # A token without any counted occurrence tells us nothing and would
        # cause a division by zero below. Robinson's formula yields rob_x for
        # zero observations, so return that directly.
        if($rel_all <= 0)
            return $this->config['rob_x'];

        $rating = $rel_spam / $rel_all;

        # Calculate the better probability proposed by Mr. Robinson
        $all = $data['count_ham'] + $data['count_spam'];
        return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);

    }

    /**
     * Check the validity of the category of a request
     *
     * @access private
     * @param string $category
     * @return boolean TRUE if $category is b8::HAM or b8::SPAM, otherwise FALSE
     */

    private function _check_category($category)
    {
        return ($category === self::HAM || $category === self::SPAM);
    }

    /**
     * Learn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param mixed $uid Passed to the storage backend
     * @return mixed An error code on failure, otherwise the storage backend's result
     */

    public function learn($text, $category, $uid)
    {
        return $this->_process_text($text, $category, self::LEARN, $uid);
    }

    /**
     * Unlearn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param mixed $uid Passed to the storage backend
     * @return mixed An error code on failure, otherwise the storage backend's result
     */

    public function unlearn($text, $category, $uid)
    {
        return $this->_process_text($text, $category, self::UNLEARN, $uid);
    }

    /**
     * Does the actual interaction with the storage backend for learning or unlearning texts
     *
     * @access private
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @param mixed $uid Passed to the storage backend
     * @return mixed A startup, category or lexer error code on failure,
     *               otherwise whatever the backend's process_text() returns
     */

    private function _process_text($text, $category, $action, $uid = 0)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Look if the request is okay
        if($this->_check_category($category) === FALSE)
            return self::TRAINER_CATEGORY_FAIL;

        # Get all tokens from $text

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Pass the tokens and what to do with it to the storage backend
        return $this->_database->process_text($tokens, $category, $action, $uid);

    }

}
502
503 ?>