]> git.mxchange.org Git - hub.git/blobdiff - application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
Updated 'core'.
[hub.git] / application / hub / main / source / urls / class_CrawlerUploadedListUrlSource.php
index 605478d6059419ab3a1420bc6713aab9f2d87c5a..7063492b0bd2c96f037c6cbaa84c3e00fa1d23fa 100644 (file)
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
+       /**
+        * "Cached" CSV path
+        */
+       private $csvFilePath = '';
+
+       /**
+        * Last CSV file instance
+        */
+       private $lastCsvFileInstance = NULL;
+
+       /**
+        * Stack for pushing data from this class to another
+        */
+       private $stackSourceInstance = NULL;
+
+       /**
+        * Stack name for a CSV file
+        */
+       const STACK_NAME_CSV_FILE = 'csv_file';
+
+       /**
+        * "Imported" CSV files
+        */
+       private $csvFileImported = array();
+
        /**
         * Protected constructor
         *
@@ -30,6 +55,57 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        protected function __construct () {
                // Call parent constructor
                parent::__construct(__CLASS__);
+
+               // "Cache" full CSV path (config entries 'base_path' and 'crawler_csv_file_path') for faster usage
+               $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
+
+               // Create a directory instance by configured name 'directory_class', passing the CSV path
+               $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
+
+               // Set it in this instance, isCsvFileFound() reads directory entries through it
+               $this->setDirectoryInstance($directoryInstance);
+
+               // Init stack instance, importCsvFile() pushes found CSV file instances on it
+               $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
+
+               // Init the named stack for CSV files
+               $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
+       }
+
+       /**
+        * Checks whether a not yet imported CSV file is found in configured path and, if so, creates a CSV file instance for it in lastCsvFileInstance
+        *
+        * @return      $isFound        Whether a CSV file is found (FALSE if the read entry is empty or does not end with '.csv')
+        */
+       private function isCsvFileFound () {
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+               // Is the directory iterator no longer valid?
+               if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
+                       // Rewind to start, so the directory is read again from its beginning
+                       $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
+               } // END - if
+
+               // Read next entry, except '.htaccess', '.', '..' and all already imported CSV files
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
+               $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
+
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
+
+               // Is it empty or has it a wrong file extension (the last 4 characters must be exactly '.csv')?
+               if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
+                       // Skip further processing
+                       /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
+                       return FALSE;
+               } // END - if
+
+               // Initialize CSV file instance for the found entry, importCsvFile() picks it up from lastCsvFileInstance
+               $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
+
+               // Found an entry
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+               return TRUE;
        }
 
        /**
@@ -44,11 +120,46 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                // Init source
                $sourceInstance->initSource('crawler', 'uploaded_list');
 
-               // Get a 
                // Return the prepared instance
                return $sourceInstance;
        }
 
+       /**
+        * Initializes the import of the last found CSV file by stacking its instance (it is then being processed by other task) and marking it as imported
+        *
+        * @return      void
+        * @throws      NullPointerException    If lastCsvFileInstance is not set
+        */
+       private function importCsvFile () {
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+               // Is the instance set? (isCsvFileFound() sets it when it has found a CSV file)
+               if (is_null($this->lastCsvFileInstance)) {
+                       // This should not happen
+                       throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
+               } // END - if
+
+               // Push the CSV file instance on the named stack for CSV files
+               $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);
+
+               // ... and mark its base name as "imported", so isCsvFileFound() excludes it from reading
+               array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));
+
+               // ... and finally NULL it (to save some RAM)
+               $this->lastCsvFileInstance = NULL;
+
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+       }
+
+       /**
+        * Getter for stackSourceInstance variable (the stack instance created in the constructor)
+        *
+        * @return      $stackSourceInstance    An instance of an additional stack, used here for found CSV files
+        */
+       public final function getStackSourceInstance () {
+               return $this->stackSourceInstance;
+       }
+
        /**
         * Processes entries in the stack.
         *
@@ -58,9 +169,19 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        public function processStack () {
                // Does the URL stack have some entries left?
                if (!$this->isUrlStackEmpty()) {
-                       // Nothing to handle here
+                       /*
+                        * Handle next entry. This method will be called very often, so
+                        * there is no need to process more than one entry at a time.
+                        */
                        $this->processNextEntry();
-               } elseif ($this->
+               } elseif ($this->isCsvFileFound()) {
+                       /*
+                        * A file containing a URL list is found. Please note the format is
+                        * CSV-like as you may wish to provide meta data such as crawl
+                        * depth, handling of 3rd-party URLs and such.
+                        */
+                       $this->importCsvFile();
+               }
 
                $this->partialStub('Please implement this method.');
        }