library/spam/b8/lexer/lexer_default.php

   1 <?php
   2
   3 #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
   4 #
   5 #   This file is part of the b8 package
   6 #
   7 #   This program is free software; you can redistribute it and/or modify it
   8 #   under the terms of the GNU Lesser General Public License as published by
   9 #   the Free Software Foundation in version 2.1 of the License.
  10 #
  11 #   This program is distributed in the hope that it will be useful, but
  12 #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  14 #   License for more details.
  15 #
  16 #   You should have received a copy of the GNU Lesser General Public License
  17 #   along with this program; if not, write to the Free Software Foundation,
  18 #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  19
  20 /**
  21  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
  22  *
  23  * @license LGPL
  24  * @access public
  25  * @package b8
  26  * @author Tobias Leupold
  27  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
  28  */
  29
  30 class b8_lexer_default
  31 {
  32
  33         const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
  34         const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';
  35
  36         public $config = NULL;
  37
  38         # The regular expressions we use to split the text to tokens
  39
  40         public $regexp = array(
  41                 'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
  42                 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
  43                 'html'      => '/(<.+?>)/',
  44                 'tagname'   => '/(.+?)\s/',
  45                 'numbers'   => '/^[0-9]+$/'
  46         );
  47
  48         /**
  49          * Constructs the lexer.
  50          *
  51          * @access public
  52          * @return void
  53          */
  54
  55         function __construct($config)
  56         {
  57                 $this->config = $config;
  58         }
  59
  60         /**
  61          * Generates the tokens required for the bayesian filter.
  62          *
  63          * @access public
  64          * @param string $text
  65          * @return array Returns the list of tokens
  66          */
  67
  68         public function get_tokens($text)
  69         {
  70
  71                 # Check that we actually have a string ...
  72                 if(is_string($text) === FALSE)
  73                         return self::LEXER_TEXT_NOT_STRING;
  74
  75                 # ... and that it's not empty
  76                 if(empty($text) === TRUE)
  77                         return self::LEXER_TEXT_EMPTY;
  78
  79                 # Re-convert the text to the original characters coded in UTF-8, as
  80                 # they have been coded in html entities during the post process
  81                 $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
  82
  83                 $tokens = array();
  84
  85                 # Find URLs and IP addresses
  86
  87                 preg_match_all($this->regexp['ip'], $text, $raw_tokens);
  88
  89                 foreach($raw_tokens[1] as $word) {
  90
  91                         # Check for a dot
  92                         if(strpos($word, '.') === FALSE)
  93                                 continue;
  94
  95                         # Check that the word is valid, min and max sizes, etc.
  96                         if($this->_is_valid($word) === FALSE)
  97                                 continue;
  98
  99                         if(isset($tokens[$word]) === FALSE)
 100                                 $tokens[$word] = 1;
 101                         else
 102                                 $tokens[$word] += 1;
 103
 104                         # Delete the word from the text so it doesn't get re-added.
 105                         $text = str_replace($word, '', $text);
 106
 107                         # Also process the parts of the URLs
 108                         $url_parts = preg_split($this->regexp['raw_split'], $word);
 109
 110                         foreach($url_parts as $word) {
 111
 112                                 # Again validate the part
 113
 114                                 if($this->_is_valid($word) === FALSE)
 115                                         continue;
 116
 117                                 if(isset($tokens[$word]) === FALSE)
 118                                         $tokens[$word] = 1;
 119                                 else
 120                                         $tokens[$word] += 1;
 121
 122                         }
 123
 124                 }
 125
 126                 # Split the remaining text
 127
 128                 $raw_tokens = preg_split($this->regexp['raw_split'], $text);
 129
 130                 foreach($raw_tokens as $word) {
 131
 132                         # Again validate the part
 133
 134                         if($this->_is_valid($word) === FALSE)
 135                                 continue;
 136
 137                         if(isset($tokens[$word]) === FALSE)
 138                                 $tokens[$word] = 1;
 139                         else
 140                                 $tokens[$word] += 1;
 141
 142                 }
 143
 144                 # Process the HTML
 145
 146                 preg_match_all($this->regexp['html'], $text, $raw_tokens);
 147
 148                 foreach($raw_tokens[1] as $word) {
 149
 150                         # Again validate the part
 151
 152                         if($this->_is_valid($word) === FALSE)
 153                                 continue;
 154
 155                         # If the tag has parameters, just use the tag itself
 156
 157                         if(strpos($word, ' ') !== FALSE) {
 158                                 preg_match($this->regexp['tagname'], $word, $tmp);
 159                                 $word = "{$tmp[1]}...>";
 160                         }
 161
 162                         if(isset($tokens[$word]) === FALSE)
 163                                 $tokens[$word] = 1;
 164                         else
 165                                 $tokens[$word] += 1;
 166
 167                 }
 168
 169                 # Return a list of all found tokens
 170                 return $tokens;
 171
 172         }
 173
 174         /**
 175          * Validates a token.
 176          *
 177          * @access private
 178          * @param string $token The token string.
 179          * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
 180          */
 181
 182         private function _is_valid($token)
 183         {
 184
 185                 # Validate the size of the token
 186
 187                 $len = strlen($token);
 188
 189                 if($len < $this->config['min_size'] or $len > $this->config['max_size'])
 190                         return FALSE;
 191
 192                 # We may want to exclude pure numbers
 193                 if($this->config['allow_numbers'] === FALSE) {
 194                         if(preg_match($this->regexp['numbers'], $token) > 0)
 195                                 return FALSE;
 196                 }
 197
 198                 # Token is okay
 199                 return TRUE;
 200
 201         }
 202
 203 }
 204
 205 ?>