// CFG: STACKER-CSV-FILE-MAX-SIZE
$cfg->setConfigEntry('stacker_csv_file_max_size', 10);
+// CFG: STACKER-CSV-ENTRY-MAX-SIZE
+$cfg->setConfigEntry('stacker_csv_entry_max_size', 100);
+
// CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
$cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
// CFG: CRAWLER-CSV-FILE-PATH
$cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
+// CFG: CRAWLER-URL-LIST-COLUMN-SEPARATOR
+$cfg->setConfigEntry('crawler_url_list_column_separator', ',');
+
///////////////////////////////////////////////////////////////////////////////
// HTTP Configuration
///////////////////////////////////////////////////////////////////////////////
*/
const STACK_NAME_CSV_FILE = 'csv_file';
+ /**
+ * Stack name for a CSV entry
+ */
+ const STACK_NAME_CSV_ENTRY = 'csv_entry';
+
/**
* "Imported" CSV files
*/
private $csvFileImported = array();
+ /**
+ * "Cached" separator for columns
+ */
+ private $columnSeparator = '';
+
/**
* Protected constructor
*
// Init stack instance
$this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
- // Init stack
+ // Init stacks
$this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
+ $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_ENTRY);
+
+ // "Cache" column separator
+ $this->columnSeparator = $this->getConfigInstance()->getConfigEntry('crawler_url_list_column_separator');
}
/**
return $isLoaded;
}
+ /**
+ * Tells whether at least one CSV entry is waiting on the CSV entry stack.
+ *
+ * @return $isAdded Whether the CSV entry stack is initialized and not empty
+ */
+ private function isCsvEntryAdded () {
+ // Get the stack instance only once
+ $stackInstance = $this->getStackSourceInstance();
+
+ // A stack which has not been initialized cannot hold any entry
+ if (!$stackInstance->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) {
+ // So nothing has been added
+ return FALSE;
+ } // END - if
+
+ // Otherwise an entry has been added when the stack is not empty
+ return (!$stackInstance->isStackEmpty(self::STACK_NAME_CSV_ENTRY));
+ }
+
/**
* Initializes the import of the CSV file which is being processed by other task
*
}
/**
- * Parses the next stacked CSV by reading only one line from it. Then the
- * read line is being validated and if found good being feed to the next
+ * Parses the next stacked CSV file by reading only one line from it. Then
+ * the read line is validated and, if found good, pushed onto the CSV entry
* stack. The file is removed from stack only if it has been fully parsed.
*
* @return void
*/
- private function parseCsvEntry () {
+ private function parseCsvFile () {
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Get next CSV file from stack
+ $csvFileInstance = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_FILE);
+
+ // Read full "CSV line", columns are split by the "cached" separator
+ $csvData = $csvFileInstance->readCsvFileLine($this->columnSeparator);
+
// Debug message
- /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+ // Expect always an array
+ assert(is_array($csvData));
+
+ // Is the array empty?
+ if (count($csvData) == 0) {
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');
+
+ // Try to close it
+ $csvFileInstance->closeFile();
+
+ // This file has been fully read, so don't push it back on stack.
+ return;
+ } // END - if
+
+ // ... with 3 elements, later enhancements may accept more
+ assert(count($csvData) == 3);
+
+ /*
+ * Push the file back on stack as it may contain more entries. This way
+ * all files got rotated on stack which may improve crawler performance.
+ */
+ $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);
+
+ // Push the read line (array) on the entry stack, not the file instance
+ $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_ENTRY, $csvData);
// Debug message
- /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}
*/
public function processStack () {
// Does the stack have some entries left?
- if ($this->isCsvFileAdded()) {
+ if ($this->isCsvEntryAdded()) {
/*
* A CSV file has been found and "imported" (added to stack). Now
* the file can be read line by line and checked every one of it.
*/
$this->parseCsvEntry();
+ } elseif ($this->isCsvFileAdded()) {
+ /*
+ * A CSV file has been found and "imported" (added to stack). Now
+ * the file can be read line by line and checked every one of it.
+ */
+ $this->parseCsvFile();
} elseif ($this->isCsvFileFound()) {
/*
* A file containing an URL list is found. Please note the format is