From: Roland Haeder
Date: Thu, 5 Mar 2015 02:01:10 +0000 (+0100)
Subject: Continued with crawler:
X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=f467602c96c42d3acb8ccd5e3d858794fb685ae8;p=hub.git

Continued with crawler:
- Renamed parseCsvEntry() to parseCsvFile() as it reads a CSV file
- Added check method if a CSV entry is stacked
- Updated 'core' to latest commit

Signed-off-by: Roland Haeder
---

diff --git a/application/hub/config.php b/application/hub/config.php
index 3afe1f12b..c7401fe7d 100644
--- a/application/hub/config.php
+++ b/application/hub/config.php
@@ -1313,6 +1313,9 @@ $cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacke
 // CFG: STACKER-CSV-FILE-MAX-SIZE
 $cfg->setConfigEntry('stacker_csv_file_max_size', 10);
 
+// CFG: STACKER-CSV-ENTRY-MAX-SIZE
+$cfg->setConfigEntry('stacker_csv_entry_max_size', 100);
+
 // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
 $cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
 
@@ -1481,6 +1484,9 @@ $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
 // CFG: CRAWLER-CSV-FILE-PATH
 $cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
 
+// CFG: CRAWLER-URL-LIST-COLUMN-SEPARATOR
+$cfg->setConfigEntry('crawler_url_list_column_separator', ',');
+
 ///////////////////////////////////////////////////////////////////////////////
 //                            HTTP Configuration
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
index 3fd94f31d..6283a46c7 100644
--- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
+++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
@@ -42,11 +42,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
 	 */
 	const STACK_NAME_CSV_FILE = 'csv_file';
 
+	/**
+	 * Stack name for a CSV entry
+	 */
+	const STACK_NAME_CSV_ENTRY = 'csv_entry';
+
 	/**
 	 * "Imported" CSV files
 	 */
 	private $csvFileImported = array();
 
+	/**
+	 * "Cached" separator for columns
+	 */
+	private $columnSeparator = '';
+
 	/**
 	 * Protected constructor
 	 *
@@ -68,8 +78,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
 		// Init stack instance
 		$this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
 
-		// Init stack
+		// Init stacks
 		$this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
+		$this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_ENTRY);
+
+		// "Cache" column separator
+		$this->columnSeparator = $this->getConfigInstance()->getConfigEntry('crawler_url_list_column_separator');
 	}
 
 	/**
@@ -139,6 +153,19 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
 		return $isLoaded;
 	}
 
+	/**
+	 * Checks whether a CSV entry has been added to the stack
+	 *
+	 * @return	$isLoaded	Whether a CSV entry has been added
+	 */
+	private function isCsvEntryAdded () {
+		// Check whether the stacker is not empty
+		$isLoaded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY)));
+
+		// Return the result
+		return $isLoaded;
+	}
+
 	/**
 	 * Initializes the import of the CSV file which is being processed by other task
 	 *
@@ -167,18 +194,54 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
 	}
 
 	/**
-	 * Parses the next stacked CSV by reading only one line from it. Then the
-	 * read line is being validated and if found good being feed to the next
+	 * Parses the next stacked CSV file by reading only one line from it. Then
+	 * the read line is being validated and if found good being feed to the next
 	 * stack. The file is removed from stack only if it has been fully parsed.
 	 *
 	 * @return	void
 	 */
-	private function parseCsvEntry () {
+	private function parseCsvFile () {
+		// Debug message
+		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+		// Get next entry
+		$csvFileInstance = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_FILE);
+
+		// Read full "CSV line"
+		$csvData = $csvFileInstance->readCsvFileLine($this->columnSeparator);
+
 		// Debug message
-		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+		// Expect always an array
+		assert(is_array($csvData));
+
+		// Is the array empty?
+		if (count($csvData) == 0) {
+			// Debug message
+			//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');
+
+			// Try to close it
+			$csvFileInstance->closeFile();
+
+			// This file has been fully read, so don't push it back on stack.
+			return;
+		} // END - if
+
+		// ... with 3 elements, later enhancements may accept more
+		assert(count($csvData) == 3);
+
+		/*
+		 * Push the file back on stack as it may contain more entries. This way
+		 * all files got rotated on stack which may improve crawler performance.
+		 */
+		$this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);
+
+		// Push array on next stack
+		$this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_ENTRY, $csvData);
 
 		// Debug message
-		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
 	}
 
 
@@ -199,12 +262,18 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
 	 */
 	public function processStack () {
 		// Does the stack have some entries left?
-		if ($this->isCsvFileAdded()) {
+		if ($this->isCsvEntryAdded()) {
 			/*
-			 * A CSV file has been found and "imported" (added to stack). Now
-			 * the file can be read line by line and checked every one of it.
+			 * A CSV entry has been stacked by parseCsvFile(), handle it first.
+			 * NOTE(review): parseCsvEntry() is renamed away here, confirm it exists.
 			 */
 			$this->parseCsvEntry();
+		} elseif ($this->isCsvFileAdded()) {
+			/*
+			 * A CSV file has been found and "imported" (added to stack). Now
+			 * the file can be read line by line and checked every one of it.
+			 */
+			$this->parseCsvFile();
 		} elseif ($this->isCsvFileFound()) {
 			/*
 			 * A file containing an URL list is found. Please note the format is
diff --git a/core b/core
index f2d79735f..f9d9f2a93 160000
--- a/core
+++ b/core
@@ -1 +1 @@
-Subproject commit f2d79735f329e3dafe347a56686122b3b5bdbea9
+Subproject commit f9d9f2a93c091cb3d6381927d4d20293207a9e30