*/
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
/**
- * "Cached" CSV path
+ * Stack name for a CSV file
*/
- private $csvFilePath = '';
+ const STACK_NAME_CSV_FILE = 'csv_file';
/**
- * Last CSV file instance
+ * Stack name for a CSV entry
*/
- private $lastCsvFileInstance = NULL;
+ const STACK_NAME_CSV_ENTRY = 'csv_entry';
/**
- * Stack for pushing data from this clas to another
+ * Size of crawl (CSV) entry which is an indexed array:
+ *
+ * 0 = URL to crawl
+ * 1 = Crawl depth of URL
+ * 2 = Crawl depth of linked URLs (same other host only)
*/
- private $stackSourceInstance = NULL;
+ const CRAWL_ENTRY_SIZE = 3;
/**
- * Stack name for a CSV file
+ * "Cached" CSV path
*/
- const STACK_NAME_CSV_FILE = 'csv_file';
+ private $csvFilePath = '';
/**
- * Stack name for a CSV entry
+ * Last CSV file instance
*/
- const STACK_NAME_CSV_ENTRY = 'csv_entry';
+ private $lastCsvFileInstance = NULL;
+
+ /**
+ * Stack for pushing data from this class to another
+ */
+ private $stackSourceInstance = NULL;
/**
* "Imported" CSV files
return $sourceInstance;
}
+ /**
+ * Enriches the given CSV entry (array) and saves it in the assigned
+ * file-based stack. Much more information is added to such an entry, such
+ * as which files shall be crawled.
+ *
+ * @param $csvData Indexed array with data from a CSV file; must hold exactly self::CRAWL_ENTRY_SIZE elements (0 = start URL, 1 = depth, 2 = external depth)
+ * @return void
+ */
+ private function saveCsvDataInCrawlerQueue (array $csvData) {
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!');
+
+ // The array must have a fixed number of elements, later enhancements may accept more
+ assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
+
+ /*
+ * First convert the indexed array into an associative array. Don't
+ * forget to expand this array as well when you want to add another
+ * column to the CSV file.
+ */
+ $csvArray = array(
+ self::CRAWL_JOB_ARRAY_START_URL => $csvData[0],
+ self::CRAWL_JOB_ARRAY_DEPTH => $csvData[1],
+ self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2]
+ );
+
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - BEFORE!');
+
+ // Then add more data to it
+ $this->enrichCrawlerQueueData($csvArray);
+
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - AFTER!');
+
+ /*
+ * Then enqueue it in the file stack. The local crawler "task" will
+ * then pick this up.
+ */
+ $this->enqueueInFileStack($csvArray);
+
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ }
+
/**
* Checks whether a CSV file has been loaded (added to the stack)
*
- * @return $isLoaded Whether a CSV file has been loaded
+ * @return $isAdded Whether a CSV file has been added (the CSV file stack is initialized and not empty)
*/
private function isCsvFileAdded () {
// Check whether the stacker is not empty
- $isLoaded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE)));
+ $isAdded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE)));
// Return the result
- return $isLoaded;
+ return $isAdded;
}
/**
* Checks whether a CSV entry has been added to the stack
*
- * @return $isLoaded Whether a CSV entry has been added
+ * @return $isAdded Whether a CSV entry has been added (the CSV entry stack is initialized and not empty)
*/
private function isCsvEntryAdded () {
// Check whether the stacker is not empty
- $isLoaded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY)));
+ $isAdded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY)));
// Return the result
- return $isLoaded;
+ return $isAdded;
}
/**
// Debug message
//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');
- // Try to close it
- $csvFileInstance->closeFile();
+ // Try to close it by actually unsetting (destructing) it
+ unset($csvFileInstance);
// This file as been fully read, so don't push it back on stack.
return;
} // END - if
- // ... with 3 elements, later enhancements may accept more
- assert(count($csvData) == 3);
+ // ... with a fixed amount of elements, later enhancements may accept more
+ assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
/*
* Push the file back on stack as it may contain more entries. This way
*/
private function parseCsvEntry () {
// Debug message
- /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Pop the next CSV entry from the CSV entry stack
+ $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY);
+
+ // Debug message
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+ // It must have a fixed number of elements (see method parseCsvFile() for details)
+ assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
+
+ // Save it in the crawler queue (which will enrich it with much more information)
+ $this->saveCsvDataInCrawlerQueue($csvData);
// Debug message
- /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}
/**
}
/**
- * Processes entries in the stack.
+ * Fills the URL stack with new entries from source
*
* @return void
- * @todo ~20% done
+ * @todo ~40% done
*/
- public function processStack () {
+ public function fillUrlStack () {
// Does the stack have some entries left?
if ($this->isCsvEntryAdded()) {
/*
* depth, handling of 3rd-party URLs and such.
*/
$this->addCsvFile();
- } elseif (!$this->isUrlStackEmpty()) {
- /*
- * Handle next entry. This method will be called very often, so need
- * to process more than one entry at a time.
- */
- $this->processNextEntry();
}
$this->partialStub('Please implement this method.');