From b6b68abc9cb8c39b1b1db3260a895554c9c9bfba Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Sun, 7 Dec 2014 22:45:55 +0100 Subject: [PATCH] Continued: - Added some more stuff for URL CSV file importing - Ignored all files in data/url_lists/ (and removed a "demo" file) - Used new 'core' Signed-off-by: Roland Haeder --- .gitignore | 1 + application/hub/config.php | 9 ++ .../class_CrawlerUploadedListUrlSource.php | 112 +++++++++++++++++- core | 2 +- data/url_lists/demo.lst | 1 - 5 files changed, 120 insertions(+), 5 deletions(-) delete mode 100644 data/url_lists/demo.lst diff --git a/.gitignore b/.gitignore index fc60ff9a3..3dc43e248 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ docs/latex/* docs/warn.log /nbproject data/stacks/*.stack* +data/url_lists/*.* diff --git a/application/hub/config.php b/application/hub/config.php index c4b77f9af..33069d54f 100644 --- a/application/hub/config.php +++ b/application/hub/config.php @@ -1289,6 +1289,12 @@ $cfg->setConfigEntry('crawler_url_rss_start_file_stack_index_class', 'FileStackI // CFG: CRAWLER-URL-FOUND-RSS-FILE-STACK-INDEX-CLASS $cfg->setConfigEntry('crawler_url_found_rss_file_stack_index_class', 'FileStackIndex'); +// CFG: CRAWLER-URL-UPLOADED-LIST-URL-SOURCE-STACK-CLASS +$cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacker'); + +// CFG: STACKER-CSV-FILE-MAX-SIZE +$cfg->setConfigEntry('stacker_csv_file_max_size', 10); + // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY $cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500); @@ -1454,6 +1460,9 @@ $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_interval_delay', 1000); // CFG: TASK-CRAWLER-URL-SOURCE-FOUND-RSS-MAX-RUNS $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0); +// CFG: CSV-FILE-PATH +$cfg->setConfigEntry('csv_file_path', 'data/url_lists'); + /////////////////////////////////////////////////////////////////////////////// // HTTP Configuration /////////////////////////////////////////////////////////////////////////////// diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index 605478d60..dd2a84ead 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -22,6 +22,31 @@ * along with this program. If not, see . */ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable { + /** + * "Cached" CSV path + */ + private $csvFilePath = ''; + + /** + * Last CSV file instance + */ + private $lastCsvFileInstance = NULL; + + /** + * Stack for pushing data from this clas to another + */ + private $stackSourceInstance = NULL; + + /** + * Stack name for a CSV file + */ + const STACK_NAME_CSV_FILE = 'csv_file'; + + /** + * "Imported" CSV files + */ + private $csvFileImported = array(); + /** * Protected constructor * @@ -30,6 +55,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R protected function __construct () { // Call parent constructor parent::__construct(__CLASS__); + + // "Cache" CSV path for faster usage + $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('csv_file_path'); + + // Initialize directory instance + $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath)); + + // Set it here + $this->setDirectoryInstance($directoryInstance); + + // Init stack instance + $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class'); + + // Init stack + $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE); } /** @@ -44,11 +84,74 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R // Init source $sourceInstance->initSource('crawler', 'uploaded_list'); - // Get a // Return the prepared instance return $sourceInstance; } + /** + * Checks whether a CSV file is found + * + * @return $isFound Whether a CSV file is found + */ + private function isCsvFileFound () { + // Is the instance valid? + if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) { + // Then rewind it + $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind(); + } // END - if + + // Read next entry + $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported)); + + // The read entry has not to be empty and extension must be '.csv' + if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) { + // Skip further processing + return FALSE; + } // END - if + + // Initialize CSV file instance + $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry)); + + // Found an entry + return TRUE; + } + + /** + * Initializes the import of the CSV file which is being processed by other task + * + * @return void + * @throws NullPointerException If lastCsvFileInstance is not set + */ + private function importCsvFile () { + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + + // Is the instance set? + if (is_null($this->lastCsvFileInstance)) { + // This should not happen + throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER); + } // END - if + + // Stack this file + $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance); + + // ... and mark it as "imported" + array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName())); + + // ... and finally NULL it (to save some RAM) + $this->lastCsvFileInstance = NULL; + + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } + + /** + * Getter for stackSourceInstance variable + * + * @return $stackSourceInstance An instance of an additional stack + */ + public final function stackSourceInstance () { + return $this->stackSourceInstance; + } + /** * Processes entries in the stack. * @@ -58,9 +161,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R public function processStack () { // Does the stack have some entries left? if (!$this->isUrlStackEmpty()) { - // Nothing to handle here + // Handle next entry $this->processNextEntry(); - } elseif ($this-> + } elseif ($this->isCsvFileFound()) { + // A CSV file has been found and can maybe be imported. + $this->importCsvFile(); + } $this->partialStub('Please implement this method.'); } diff --git a/core b/core index 80d808f78..c8ea0af3f 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 80d808f788a6b4712bc7a33abcfcc8bb432cbdf9 +Subproject commit c8ea0af3f3bfe092c38f0864d689a82172af19c0 diff --git a/data/url_lists/demo.lst b/data/url_lists/demo.lst deleted file mode 100644 index 84b5b400b..000000000 --- a/data/url_lists/demo.lst +++ /dev/null @@ -1 +0,0 @@ -http://mxchange.org -- 2.39.5