From 66b3a9329ab206d3c459cdfa02170a3a654df207 Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Sun, 11 Jan 2015 21:51:25 +0100 Subject: [PATCH] Continued with crawler (won't work with out-dated core): - Added new method to check stack size for added CSV files - Added empty stub for adding entries from loaded CSV file Signed-off-by: Roland Haeder --- .../class_CrawlerUploadedListUrlSource.php | 62 ++++++++++++++++--- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index ec77631f0..80d029b80 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -103,7 +103,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } // END - if // Read next entry - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE)); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE)); $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported)); // The read entry has not to be empty and extension must be '.csv' @@ -113,20 +113,35 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } // END - if // Initialize CSV file instance - $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry)); + $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($this->csvFilePath . '/' . 
$directoryEntry)); + + // Debug message + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - Instance created - EXIT!'); // Found an entry - //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); return TRUE; } + /** + * Checks whether a CSV file has been loaded (added to the stack) + * + * @return $isLoaded Whether a CSV file has been loaded + */ + private function isCsvFileAdded () { + // Check whether the stacker is not empty + $isLoaded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE))); + + // Return the result + return $isLoaded; + } + /** * Initializes the import of the CSV file which is being processed by other task * * @return void * @throws NullPointerException If lastCsvFileInstance is not set */ - private function importCsvFile () { + private function addCsvFile () { //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); // Is the instance set? @@ -147,6 +162,20 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } + /** + * Parses the next stacked CSV by reading only one line from it. Then the + * read line is validated and, if found good, fed to the next + * stack. The file is removed from stack only if it has been fully parsed. + */ + private function parseCsvEntry () { + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . 
']: CALLED!'); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } + + /** * Getter for stackSourceInstance variable * @@ -160,16 +189,29 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * Processes entries in the stack. * * @return void - * @todo ~10% done + * @todo ~20% done */ public function processStack () { // Does the stack have some entries left? - if (!$this->isUrlStackEmpty()) { - // Handle next entry - $this->processNextEntry(); + if ($this->isCsvFileAdded()) { + /* + * A CSV file has been found and "imported" (added to stack). Now + * the file can be read line by line, with each line being checked. + */ + $this->parseCsvEntry(); } elseif ($this->isCsvFileFound()) { - // A CSV file has been found and can maybe be imported. - $this->importCsvFile(); + /* + * A file containing a URL list is found. Please note the format is + * CSV-like as you may wish to provide meta data such as crawl + * depth, handling of 3rd-party URLs and such. + */ + $this->addCsvFile(); + } elseif (!$this->isUrlStackEmpty()) { + /* + * Handle next entry. This method will be called very often, so need + * to process more than one entry at a time. + */ + $this->processNextEntry(); } $this->partialStub('Please implement this method.'); -- 2.39.5