]> git.mxchange.org Git - hub.git/commitdiff
Merge branch 'master' into refacuring/protocol_handler
author Roland Haeder <roland@mxchange.org>
Sun, 7 Dec 2014 22:30:22 +0000 (23:30 +0100)
committer Roland Haeder <roland@mxchange.org>
Sun, 7 Dec 2014 22:30:22 +0000 (23:30 +0100)
Signed-off-by: Roland Haeder <roland@mxchange.org>
Conflicts:
application/hub/config.php
application/hub/interfaces/resolver/.htaccess
application/hub/main/nodes/class_BaseHubNode.php
application/hub/main/resolver/protocol/class_BaseProtocolResolver.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php

1  2 
application/hub/config.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php

index 217135e032a9565784a9bbf68573fd575c11ef78,33069d54f026ad4de21587a26a55474c806ba3d0..f6196ff8b6f106de492d1078625554cf336ae068
@@@ -4,7 -4,7 +4,7 @@@
   *
   * @author            Roland Haeder <webmaster@shipsimu.org>
   * @version           0.0
 - * @copyright Copyright (c) 2007 - 2008 Roland Haeder, 2009 - 2012 Hub Developer Team
 + * @copyright Copyright (c) 2007 - 2008 Roland Haeder, 2009 - 2014 Hub Developer Team
   * @license           GNU GPL 3.0 or any newer version
   *
   * This program is free software: you can redistribute it and/or modify
@@@ -729,9 -729,6 +729,9 @@@ $cfg->setConfigEntry('socket_discovery_
  // CFG: DHT-RECIPIENT-DISCOVERY-CLASS
  $cfg->setConfigEntry('dht_recipient_discovery_class', 'DhtRecipientDiscovery');
  
 +// CFG: UNIVERSAL-NODE-LOCATOR-DISCOVERY-CLASS
 +$cfg->setConfigEntry('unl_discovery_class', 'UniversalNodeLocatorDiscovery');
 +
  // CFG: RECIPIENT-LIST-CLASS
  $cfg->setConfigEntry('recipient_list_class', 'RecipientList');
  
@@@ -744,12 -741,6 +744,12 @@@ $cfg->setConfigEntry('tcp_connection_he
  // CFG: UDP-CONNECTION-HELPER-CLASS
  $cfg->setConfigEntry('udp_connection_helper_class', 'UdpConnectionHelper');
  
 +// CFG: HUB-COMMUNICATION-PROTOCOL-TYPE
 +$cfg->setConfigEntry('hub_communication_protocol_type', 'tcp');
 +
 +// CFG: TCP-PROTOCOL-RESOLVER-CLASS
 +$cfg->setConfigEntry('tcp_protocol_resolver_class', 'TcpProtocolResolver');
 +
  // CFG: TCP-BUFFER-LENGTH
  $cfg->setConfigEntry('tcp_buffer_length', 1024);
  
@@@ -1298,6 -1289,12 +1298,12 @@@ $cfg->setConfigEntry('crawler_url_rss_s
  // CFG: CRAWLER-URL-FOUND-RSS-FILE-STACK-INDEX-CLASS
  $cfg->setConfigEntry('crawler_url_found_rss_file_stack_index_class', 'FileStackIndex');
  
+ // CFG: CRAWLER-UPLOADED-LIST-URL-SOURCE-STACK-CLASS
+ $cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacker');
+ // CFG: STACKER-CSV-FILE-MAX-SIZE
+ $cfg->setConfigEntry('stacker_csv_file_max_size', 10);
  // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
  $cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
  
@@@ -1463,8 -1460,8 +1469,8 @@@ $cfg->setConfigEntry('task_crawler_uplo
  // CFG: TASK-CRAWLER-UPLOADED-LIST-SCANNER-MAX-RUNS
  $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
  
 -// CFG: CSV-FILE-PATH
 -$cfg->setConfigEntry('csv_file_path', 'data/url_lists');
 +// CFG: CRAWLER-CSV-FILE-PATH
 +$cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
  
  ///////////////////////////////////////////////////////////////////////////////
  //                            HTTP Configuration
index c98aa6ff039e347b3a808259f5b93a6ff702f77f,ec77631f0c64191c4ca852d3974b6e1f97ca667c..5b1814594a3b565105a4a83f92c4d48c866637cb
   */
  class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
        /**
-        * Cached path of CSV files
+        * "Cached" CSV path
         */
-       private $csvFilesPath = '';
+       private $csvFilePath = '';
  
        /**
-        * Last found CSV file
+        * Last CSV file instance
         */
-       private $lastCsvFile = '';
+       private $lastCsvFileInstance = NULL;
+       /**
+        * Stack for pushing data from this class to another
+        */
+       private $stackSourceInstance = NULL;
+       /**
+        * Stack name for a CSV file
+        */
+       const STACK_NAME_CSV_FILE = 'csv_file';
+       /**
+        * "Imported" CSV files
+        */
+       private $csvFileImported = array();
  
        /**
         * Protected constructor
                // Call parent constructor
                parent::__construct(__CLASS__);
  
-               // Set CSV files path
-               $this->csvFilesPath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
+               // "Cache" CSV path for faster usage
+               $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('csv_file_path');
  
-               // Get directory instance
-               $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilesPath));
+               // Initialize directory instance
+               $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
  
                // Set it here
                $this->setDirectoryInstance($directoryInstance);
+               // Init stack instance
+               $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
+               // Init stack
+               $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
        }
  
        /**
 -       * Creates an instance of this class
 -       *
 -       * @return      $sourceInstance         An instance of a Source class
 -       */
 -      public final static function createCrawlerUploadedListUrlSource () {
 -              // Get new instance
 -              $sourceInstance = new CrawlerUploadedListUrlSource();
 -
 -              // Init source
 -              $sourceInstance->initSource('crawler', 'uploaded_list');
 -
 -              // Return the prepared instance
 -              return $sourceInstance;
 -      }
 -
 -      /**
 -       * Checks whether a CSV file is found
 +       * Checks whether a CSV file is found in configured path
         *
         * @return      $isFound        Whether a CSV file is found
         */
        private function isCsvFileFound () {
 -              // Is the instance valid?
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
 +              // Is it valid?
                if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
 -                      // Then rewind it
 +                      // Rewind to start
                        $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
                } // END - if
  
                // Read next entry
-               $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array('.htaccess', '.', '..'));
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
+               $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
  
 -              // The read entry has not to be empty and extension must be '.csv'
 +              // Debug message
 +              /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
 +
 +              // Is it empty or wrong file extension?
                if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
                        // Skip further processing
 +                      /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
                        return FALSE;
                } // END - if
  
-               // Initialize CSV instance
-               $csvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilesPath . '/' . $directoryEntry));
+               // Initialize CSV file instance
+               $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
  
-               // Set it here
-               $this->setCsvFileInstance($csvFileInstance);
-               // Found a file
+               // Found an entry
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
                return TRUE;
        }
  
-               // Get a ??? @TODO
 +      /**
 +       * Creates an instance of this class
 +       *
 +       * @return      $sourceInstance         An instance of a Source class
 +       */
 +      public final static function createCrawlerUploadedListUrlSource () {
 +              // Get new instance
 +              $sourceInstance = new CrawlerUploadedListUrlSource();
 +
 +              // Init source
 +              $sourceInstance->initSource('crawler', 'uploaded_list');
 +
 +              // Return the prepared instance
 +              return $sourceInstance;
 +      }
 +
+       /**
+        * Initializes the import of the CSV file which is being processed by another task
+        *
+        * @return      void
+        * @throws      NullPointerException    If lastCsvFileInstance is not set
+        */
+       private function importCsvFile () {
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+               // Is the instance set?
+               if (is_null($this->lastCsvFileInstance)) {
+                       // This should not happen
+                       throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
+               } // END - if
+               // Stack this file
+               $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);
+               // ... and mark it as "imported"
+               array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));
+               // ... and finally NULL it (to save some RAM)
+               $this->lastCsvFileInstance = NULL;
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+       }
+       /**
+        * Getter for stackSourceInstance variable
+        *
+        * @return      $stackSourceInstance    An instance of an additional stack
+        */
+       public final function getStackSourceInstance () {
+               return $this->stackSourceInstance;
+       }
        /**
         * Processes entries in the stack.
         *
        public function processStack () {
                // Does the stack have some entries left?
                if (!$this->isUrlStackEmpty()) {
 -                      // Handle next entry
 +                      /*
 +                       * Handle next entry. This method will be called very often, so need
 +                       * to process more than one entry at a time.
 +                       */
                        $this->processNextEntry();
                } elseif ($this->isCsvFileFound()) {
 -                      // A CSV file has been found and can maybe be imported.
 +                      /*
 +                       * A file containing a URL list is found. Please note the format is
 +                       * CSV-like as you may wish to provide meta data such as crawl
 +                       * depth, handling of 3rd-party URLs and such.
 +                       */
                        $this->importCsvFile();
                }