]> git.mxchange.org Git - friendica.git/blobdiff - library/spam/b8/storage/storage_base.php
add spam engine
[friendica.git] / library / spam / b8 / storage / storage_base.php
diff --git a/library/spam/b8/storage/storage_base.php b/library/spam/b8/storage/storage_base.php
new file mode 100644 (file)
index 0000000..01f5a69
--- /dev/null
@@ -0,0 +1,395 @@
+<?php
+
+#   Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
+#
+#   This file is part of the b8 package
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU Lesser General Public License as published by
+#   the Free Software Foundation in version 2.1 of the License.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#   License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public License
+#   along with this program; if not, write to the Free Software Foundation,
+#   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+/**
+ * Functions used by all storage backends
+ * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
+ *
+ * @license LGPL
+ * @access public
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+abstract class b8_storage_base
+{
+
+       public $connected            = FALSE;
+
+       protected $_degenerator      = NULL;
+
+       const INTERNALS_TEXTS_HAM    = 'bayes*texts.ham';
+       const INTERNALS_TEXTS_SPAM   = 'bayes*texts.spam';
+       const INTERNALS_DBVERSION    = 'bayes*dbversion';
+
+       const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
+       const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
+       const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';
+
+       /**
+        * Validates the class has all it needs to work.
+        *
+        * @access protected
+        * @return mixed Returns TRUE if everything is okay, otherwise an error code.
+        */
+
+       protected function validate()
+       {
+
+               # We set up the degenerator here, as we would have to duplicate code if it
+               # was done in the constructor of the respective storage backend.
+               $class = 'b8_degenerator_' . $this->b8_config['degenerator'];
+               $this->_degenerator = new $class();
+
+               if($this->connected !== TRUE)
+                       return self::BACKEND_NOT_CONNECTED;
+
+               return TRUE;
+
+       }
+
+       /**
+        * Checks if a b8 database is used and if it's version is okay
+        *
+        * @access protected
+        * @return mixed Returns TRUE if everything is okay, otherwise an error code.
+        */
+
+       protected function check_database()
+       {
+
+               $internals = $this->get_internals();
+
+               if(isset($internals['dbversion'])) {
+                       if($internals['dbversion'] == "2") {
+                               return TRUE;
+                       }
+                       else {
+                               $this->connected = FALSE;
+                               return self::DATABASE_WRONG_VERSION;
+                       }
+               }
+               else {
+                       $this->connected = FALSE;
+                       return self::DATABASE_NOT_B8;
+               }
+
+       }
+
+       /**
+        * Parses the "count" data of a token.
+        *
+        * @access private
+        * @param string $data
+        * @return array Returns an array of the parsed data: array(count_ham, count_spam, lastseen).
+        */
+
+       private function _parse_count($data)
+       {
+
+               list($count_ham, $count_spam, $lastseen) = explode(' ', $data);
+
+               $count_ham  = (int) $count_ham;
+               $count_spam = (int) $count_spam;
+
+               return array(
+                       'count_ham'  => $count_ham,
+                       'count_spam' => $count_spam
+               );
+
+       }
+
+       /**
+        * Get the database's internal variables.
+        *
+        * @access public
+        * @return array Returns an array of all internals.
+        */
+
+       public function get_internals()
+       {
+
+               $internals = $this->_get_query(
+                       array(
+                               self::INTERNALS_TEXTS_HAM,
+                               self::INTERNALS_TEXTS_SPAM,
+                               self::INTERNALS_DBVERSION
+                       )
+               );
+
+               return array(
+                       'texts_ham'  => (int) $internals[self::INTERNALS_TEXTS_HAM],
+                       'texts_spam' => (int) $internals[self::INTERNALS_TEXTS_SPAM],
+                       'dbversion'  => (int) $internals[self::INTERNALS_DBVERSION]
+               );
+
+       }
+
+       /**
+        * Get all data about a list of tags from the database.
+        *
+        * @access public
+        * @param array $tokens
+        * @return mixed Returns FALSE on failure, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
+        */
+
+       public function get($tokens)
+       {
+
+               # Validate the startup
+
+               $started_up = $this->validate();
+
+               if($started_up !== TRUE)
+                       return $started_up;
+
+               # First we see what we have in the database.
+               $token_data = $this->_get_query($tokens);
+
+               # Check if we have to degenerate some tokens
+
+               $missing_tokens = array();
+
+               foreach($tokens as $token) {
+                       if(!isset($token_data[$token]))
+                               $missing_tokens[] = $token;
+               }
+
+               if(count($missing_tokens) > 0) {
+
+                       # We have to degenerate some tokens
+                       $degenerates_list = array();
+
+                       # Generate a list of degenerated tokens for the missing tokens ...
+                       $degenerates = $this->_degenerator->degenerate($missing_tokens);
+
+                       # ... and look them up
+
+                       foreach($degenerates as $token => $token_degenerates)
+                               $degenerates_list = array_merge($degenerates_list, $token_degenerates);
+
+                       $token_data = array_merge($token_data, $this->_get_query($degenerates_list));
+
+               }
+
+               # Here, we have all availible data in $token_data.
+
+               $return_data_tokens = array();
+               $return_data_degenerates = array();
+
+               foreach($tokens as $token) {
+
+                       if(isset($token_data[$token]) === TRUE) {
+
+                               # The token was found in the database
+
+                               # Add the data ...
+                               $return_data_tokens[$token] = $this->_parse_count($token_data[$token]);
+
+                               # ... and update it's lastseen parameter
+                               $this->_update($token, "{$return_data_tokens[$token]['count_ham']} {$return_data_tokens[$token]['count_spam']} " . $this->b8_config['today']);
+
+                       }
+
+                       else {
+
+                               # The token was not found, so we look if we
+                               # can return data for degenerated tokens
+
+                               # Check all degenerated forms of the token
+
+                               foreach($this->_degenerator->degenerates[$token] as $degenerate) {
+
+                                       if(isset($token_data[$degenerate]) === TRUE) {
+
+                                               # A degeneration of the token way found in the database
+
+                                               # Add the data ...
+                                               $return_data_degenerates[$token][$degenerate] = $this->_parse_count($token_data[$degenerate]);
+
+                                               # ... and update it's lastseen parameter
+                                               $this->_update($degenerate, "{$return_data_degenerates[$token][$degenerate]['count_ham']} {$return_data_degenerates[$token][$degenerate]['count_spam']} " . $this->b8_config['today']);
+
+                                       }
+
+                               }
+
+                       }
+
+               }
+
+               # Now, all token data directly found in the database is in $return_data_tokens
+               # and all data for degenerated versions is in $return_data_degenerates
+
+               # First, we commit the changes to the lastseen parameters
+               $this->_commit();
+
+               # Then, we return what we have
+               return array(
+                       'tokens'      => $return_data_tokens,
+                       'degenerates' => $return_data_degenerates
+               );
+
+       }
+
+       /**
+        * Stores or deletes a list of tokens from the given category.
+        *
+        * @access public
+        * @param array $tokens
+        * @param const $category Either b8::HAM or b8::SPAM
+        * @param const $action Either b8::LEARN or b8::UNLEARN
+        * @return void
+        */
+
+       public function process_text($tokens, $category, $action)
+       {
+
+               # Validate the startup
+
+               $started_up = $this->validate();
+
+               if($started_up !== TRUE)
+                       return $started_up;
+
+               # No matter what we do, we first have to check what data we have.
+
+               # First get the internals, including the ham texts and spam texts counter
+               $internals = $this->get_internals();
+
+               # Then, fetch all data for all tokens we have (and update their lastseen parameters)
+               $token_data = $this->_get_query(array_keys($tokens));
+
+               # Process all tokens to learn/unlearn
+
+               foreach($tokens as $token => $count) {
+
+                       if(isset($token_data[$token])) {
+
+                               # We already have this token, so update it's data
+
+                               # Get the existing data
+                               list($count_ham, $count_spam, $lastseen) = explode(' ', $token_data[$token]);
+                               $count_ham  = (int) $count_ham;
+                               $count_spam = (int) $count_spam;
+
+                               # Increase or decrease the right counter
+
+                               if($action === b8::LEARN) {
+                                       if($category === b8::HAM)
+                                               $count_ham += $count;
+                                       elseif($category === b8::SPAM)
+                                               $count_spam += $count;
+                               }
+
+                               elseif($action == b8::UNLEARN) {
+                                       if($category === b8::HAM)
+                                               $count_ham -= $count;
+                                       elseif($category === b8::SPAM)
+                                               $count_spam -= $count;
+                               }
+
+                               # We don't want to have negative values
+
+                               if($count_ham < 0)
+                                       $count_ham = 0;
+
+                               if($count_spam < 0)
+                                       $count_spam = 0;
+
+                               # Now let's see if we have to update or delete the token
+                               if($count_ham !== 0 or $count_spam !== 0)
+                                       $this->_update($token, "$count_ham $count_spam " . $this->b8_config['today']);
+                               else
+                                       $this->_del($token);
+
+                       }
+
+                       else {
+
+                               # We don't have the token. If we unlearn a text, we can't delete it
+                               # as we don't have it anyway, so just do something if we learn a text
+
+                               if($action === b8::LEARN) {
+
+                                       if($category === b8::HAM)
+                                               $data = '1 0 ';
+                                       elseif($category === b8::SPAM)
+                                               $data = '0 1 ';
+
+                                       $data .= $this->b8_config['today'];
+
+                                       $this->_put($token, $data);
+
+                               }
+
+                       }
+
+               }
+
+               # Now, all token have been processed, so let's update the right text
+
+               if($action === b8::LEARN) {
+
+                       if($category === b8::HAM) {
+                               $internals['texts_ham']++;
+                               $this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']);
+                       }
+
+                       elseif($category === b8::SPAM) {
+                               $internals['texts_spam']++;
+                               $this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']);
+                       }
+
+               }
+
+               elseif($action == b8::UNLEARN) {
+
+                       if($category === b8::HAM) {
+
+                               $internals['texts_ham']--;
+
+                               if($internals['texts_ham'] < 0)
+                                       $internals['texts_ham'] = 0;
+
+                               $this->_update(self::INTERNALS_TEXTS_HAM, $internals['texts_ham']);
+
+                       }
+
+                       elseif($category === b8::SPAM) {
+
+                               $internals['texts_spam']--;
+
+                               if($internals['texts_spam'] < 0)
+                                       $internals['texts_spam'] = 0;
+
+                               $this->_update(self::INTERNALS_TEXTS_SPAM, $internals['texts_spam']);
+
+                       }
+
+               }
+
+               # We're done and can commit all changes to the database now
+               $this->_commit();
+
+       }
+
+}
+
+?>
\ No newline at end of file