]> git.mxchange.org Git - friendica.git/blob - library/spam/b8/lexer/lexer_default.php
Merge branch 'master' into spam
[friendica.git] / library / spam / b8 / lexer / lexer_default.php
1 <?php
2
3 #   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
4 #
5 #   This file is part of the b8 package
6 #
7 #   This program is free software; you can redistribute it and/or modify it
8 #   under the terms of the GNU Lesser General Public License as published by
9 #   the Free Software Foundation in version 2.1 of the License.
10 #
11 #   This program is distributed in the hope that it will be useful, but
12 #   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 #   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14 #   License for more details.
15 #
16 #   You should have received a copy of the GNU Lesser General Public License
17 #   along with this program; if not, write to the Free Software Foundation,
18 #   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19
20 /**
21  * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
22  *
23  * @license LGPL
24  * @access public
25  * @package b8
26  * @author Tobias Leupold
27  * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
28  */
29
/**
 * Default lexer: splits a text into tokens (URL/IP-like words, plain words
 * and HTML tags) and counts how often each token occurs.
 */
class b8_lexer_default
{

    const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
    const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';

    # The lexer configuration. The keys 'min_size', 'max_size' and
    # 'allow_numbers' are read when a token is validated.
    public $config = NULL;

    # The regular expressions we use to split the text to tokens

    public $regexp = array(
        'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
        'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
        'html'      => '/(<.+?>)/',
        'tagname'   => '/(.+?)\s/',
        'numbers'   => '/^[0-9]+$/'
    );

    /**
     * Constructs the lexer.
     *
     * @access public
     * @param array $config Configuration providing 'min_size', 'max_size' and 'allow_numbers'
     * @return void
     */

    public function __construct($config)
    {
        $this->config = $config;
    }

    /**
     * Generates the tokens required for the bayesian filter.
     *
     * The text is processed in three passes: words containing a dot (URLs,
     * IP addresses) together with their parts, then the remaining text split
     * at separator characters, and finally the HTML tags.
     *
     * @access public
     * @param string $text
     * @return mixed An array mapping each token to its number of occurrences,
     *               or LEXER_TEXT_NOT_STRING / LEXER_TEXT_EMPTY if the input
     *               is not a string or is the empty string
     */

    public function get_tokens($text)
    {

        # Check that we actually have a string ...
        if(is_string($text) === FALSE)
            return self::LEXER_TEXT_NOT_STRING;

        # ... and that it's not empty. We compare against '' on purpose:
        # empty() would also reject the perfectly valid text "0".
        if($text === '')
            return self::LEXER_TEXT_EMPTY;

        # Re-convert the text to the original characters coded in UTF-8, as
        # they have been coded in html entities during the post process
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        $tokens = array();

        # Find URLs and IP addresses

        preg_match_all($this->regexp['ip'], $text, $candidates);

        foreach($candidates[1] as $word) {

            # Only words containing a dot are handled in this pass
            if(strpos($word, '.') === FALSE)
                continue;

            # Check that the word is valid, min and max sizes, etc.
            if($this->_is_valid($word) === FALSE)
                continue;

            $this->_add_token($tokens, $word);

            # Delete the word from the text so it doesn't get re-added.
            $text = str_replace($word, '', $text);

            # Also process the parts of the URLs
            foreach(preg_split($this->regexp['raw_split'], $word) as $part) {

                if($this->_is_valid($part) === FALSE)
                    continue;

                $this->_add_token($tokens, $part);

            }

        }

        # Split the remaining text

        foreach(preg_split($this->regexp['raw_split'], $text) as $word) {

            if($this->_is_valid($word) === FALSE)
                continue;

            $this->_add_token($tokens, $word);

        }

        # Process the HTML

        preg_match_all($this->regexp['html'], $text, $tags);

        foreach($tags[1] as $tag) {

            # The complete tag (including its parameters) is validated
            if($this->_is_valid($tag) === FALSE)
                continue;

            # If the tag has parameters, just use the tag itself
            if(strpos($tag, ' ') !== FALSE) {
                preg_match($this->regexp['tagname'], $tag, $tmp);
                $tag = "{$tmp[1]}...>";
            }

            $this->_add_token($tokens, $tag);

        }

        # Return a list of all found tokens
        return $tokens;

    }

    /**
     * Counts one occurrence of a token.
     *
     * @access private
     * @param array $tokens The token => count map, modified in place
     * @param string $token The token to count
     * @return void
     */

    private function _add_token(&$tokens, $token)
    {

        if(isset($tokens[$token]) === FALSE)
            $tokens[$token] = 1;
        else
            $tokens[$token] += 1;

    }

    /**
     * Validates a token.
     *
     * @access private
     * @param string $token The token string.
     * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
     */

    private function _is_valid($token)
    {

        # Validate the size of the token (strlen() counts bytes)

        $len = strlen($token);

        if($len < $this->config['min_size'] || $len > $this->config['max_size'])
            return FALSE;

        # We may want to exclude pure numbers
        if($this->config['allow_numbers'] === FALSE) {
            if(preg_match($this->regexp['numbers'], $token) > 0)
                return FALSE;
        }

        # Token is okay
        return TRUE;

    }

}
204
205 ?>