git.mxchange.org Git - hub.git/commitdiff
Continued with crawler:
author Roland Haeder <roland@mxchange.org>
Thu, 5 Mar 2015 02:01:10 +0000 (03:01 +0100)
committer Roland Haeder <roland@mxchange.org>
Thu, 5 Mar 2015 02:01:10 +0000 (03:01 +0100)
- Renamed parseCsvEntry() to parseCsvFile() as it reads a CSV file
- Added a method that checks whether a CSV entry is stacked
- Updated 'core' to latest commit

Signed-off-by: Roland Haeder <roland@mxchange.org>
application/hub/config.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
core

index 3afe1f12b85df01962f93d72ac1b401f8e148faf..c7401fe7d9105e0feea82a57f92c3f9f5d720907 100644 (file)
@@ -1313,6 +1313,9 @@ $cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacke
 // CFG: STACKER-CSV-FILE-MAX-SIZE
 $cfg->setConfigEntry('stacker_csv_file_max_size', 10);
 
+// CFG: STACKER-CSV-ENTRY-MAX-SIZE
+$cfg->setConfigEntry('stacker_csv_entry_max_size', 100);
+
 // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
 $cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
 
@@ -1481,6 +1484,9 @@ $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
 // CFG: CRAWLER-CSV-FILE-PATH
 $cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
 
+// CFG: CRAWLER-URL-LIST-COLUMN-SEPARATOR
+$cfg->setConfigEntry('crawler_url_list_column_separator', ',');
+
 ///////////////////////////////////////////////////////////////////////////////
 //                            HTTP Configuration
 ///////////////////////////////////////////////////////////////////////////////
index 3fd94f31d3701b56d4baee362cc3338725810316..6283a46c7254309e50264e4d49eb7bf7ed8934b7 100644 (file)
@@ -42,11 +42,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
         */
        const STACK_NAME_CSV_FILE = 'csv_file';
 
+       /**
+        * Stack name for a CSV entry
+        */
+       const STACK_NAME_CSV_ENTRY = 'csv_entry';
+
        /**
         * "Imported" CSV files
         */
        private $csvFileImported = array();
 
+       /**
+        * "Cached" separator for columns
+        */
+       private $columnSeparator = '';
+
        /**
         * Protected constructor
         *
@@ -68,8 +78,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                // Init stack instance
                $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
 
-               // Init stack
+               // Init stacks
                $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
+               $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_ENTRY);
+
+               // "Cache" column separator
+               $this->columnSeparator = $this->getConfigInstance()->getConfigEntry('crawler_url_list_column_separator');
        }
 
        /**
@@ -139,6 +153,19 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                return $isLoaded;
        }
 
+       /**
+        * Tells whether at least one CSV entry is waiting on the CSV entry stack
+        *
+        * @return      $isAdded        TRUE if the entry stack is initialized and not empty
+        */
+       private function isCsvEntryAdded () {
+               // A stack that has not been initialized cannot hold any entry
+               if (!$this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) {
+                       return FALSE;
+               } // END - if
+               return (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY));
+       }
+
        /**
         * Initializes the import of the CSV file which is being processed by other task
         *
@@ -167,18 +194,54 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        }
 
        /**
-        * Parses the next stacked CSV by reading only one line from it. Then the
-        * read line is being validated and if found good being feed to the next
+        * Parses the next stacked CSV file by reading only one line from it. Then
+        * the read line is being validated and if found good being feed to the next
         * stack. The file is removed from stack only if it has been fully parsed.
         *
         * @return      void
         */
-       private function parseCsvEntry () {
+       private function parseCsvFile () {
+               // Debug message
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+               // Get next entry
+               $csvFileInstance = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_FILE);
+
+               // Read full "CSV line"
+               $csvData = $csvFileInstance->readCsvFileLine($this->columnSeparator);
+
                // Debug message
-               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+               // Expect always an array
+               assert(is_array($csvData));
+
+               // Is the array empty?
+               if (count($csvData) == 0) {
+                       // Debug message
+                       //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');
+
+                       // Try to close it
+                       $csvFileInstance->closeFile();
+
+                       // This file as been fully read, so don't push it back on stack.
+                       return;
+               } // END - if
+
+               // ...  with 3 elements, later enhancements may accept more
+               assert(count($csvData) == 3);
+
+               /*
+                * Push the file back on stack as it may contain more entries. This way
+                * all files got rotated on stack which may improve crawler performance.
+                */
+               $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);
+
+               // Push array on next stack
+               $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_ENTRY, $csvFileInstance);
 
                // Debug message
-               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
        }
 
 
@@ -199,12 +262,18 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
         */
        public function processStack () {
                // Does the stack have some entries left?
-               if ($this->isCsvFileAdded()) {
+               if ($this->isCsvEntryAdded()) {
                        /*
                         * A CSV file has been found and "imported" (added to stack). Now
                         * the file can be read line by line and checked every one of it.
                         */
                        $this->parseCsvEntry();
+               } elseif ($this->isCsvFileAdded()) {
+                       /*
+                        * A CSV file has been found and "imported" (added to stack). Now
+                        * the file can be read line by line and checked every one of it.
+                        */
+                       $this->parseCsvFile();
                } elseif ($this->isCsvFileFound()) {
                        /*
                         * A file containing an URL list is found. Please note the format is
diff --git a/core b/core
index f2d79735f329e3dafe347a56686122b3b5bdbea9..f9d9f2a93c091cb3d6381927d4d20293207a9e30 160000 (submodule)
--- a/core
+++ b/core
@@ -1 +1 @@
-Subproject commit f2d79735f329e3dafe347a56686122b3b5bdbea9
+Subproject commit f9d9f2a93c091cb3d6381927d4d20293207a9e30