git.mxchange.org Git - hub.git/blobdiff - application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
Updated 'core'.
[hub.git] / application / hub / main / source / urls / class_CrawlerUploadedListUrlSource.php
index ec77631f0c64191c4ca852d3974b6e1f97ca667c..7063492b0bd2c96f037c6cbaa84c3e00fa1d23fa 100644 (file)
@@ -57,7 +57,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                parent::__construct(__CLASS__);
 
                // "Cache" CSV path for faster usage
-               $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('csv_file_path');
+               $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
 
                // Initialize directory instance
                $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
@@ -73,32 +73,16 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        }
 
        /**
-        * Creates an instance of this class
-        *
-        * @return      $sourceInstance         An instance of a Source class
-        */
-       public final static function createCrawlerUploadedListUrlSource () {
-               // Get new instance
-               $sourceInstance = new CrawlerUploadedListUrlSource();
-
-               // Init source
-               $sourceInstance->initSource('crawler', 'uploaded_list');
-
-               // Return the prepared instance
-               return $sourceInstance;
-       }
-
-       /**
-        * Checks whether a CSV file is found
+        * Checks whether a CSV file is found in configured path
         *
         * @return      $isFound        Whether a CSV file is found
         */
        private function isCsvFileFound () {
                //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
 
-               // Is the instance valid?
+               // Is it valid?
                if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
-                       // Then rewind it
+                       // Rewind to start
                        $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
                } // END - if
 
@@ -106,9 +90,13 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
                $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
 
-               // The read entry has not to be empty and extension must be '.csv'
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
+
+               // Is it empty or wrong file extension?
                if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
                        // Skip further processing
+                       /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
                        return FALSE;
                } // END - if
 
@@ -120,6 +108,22 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                return TRUE;
        }
 
+       /**
+        * Creates an instance of this class
+        *
+        * @return      $sourceInstance         An instance of a Source class
+        */
+       public final static function createCrawlerUploadedListUrlSource () {
+               // Get new instance
+               $sourceInstance = new CrawlerUploadedListUrlSource();
+
+               // Init source
+               $sourceInstance->initSource('crawler', 'uploaded_list');
+
+               // Return the prepared instance
+               return $sourceInstance;
+       }
+
        /**
         * Initializes the import of the CSV file which is being processed by other task
         *
@@ -165,10 +169,17 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        public function processStack () {
                // Does the stack have some entries left?
                if (!$this->isUrlStackEmpty()) {
-                       // Handle next entry
+                       /*
+                        * Handle next entry. This method will be called very often, so there is
+                        * no need to process more than one entry at a time.
+                        */
                        $this->processNextEntry();
                } elseif ($this->isCsvFileFound()) {
-                       // A CSV file has been found and can maybe be imported.
+                       /*
+                        * A file containing a URL list is found. Please note the format is
+                        * CSV-like as you may wish to provide meta data such as crawl
+                        * depth, handling of 3rd-party URLs and such.
+                        */
                        $this->importCsvFile();
                }