From 3e761c738a27e49ef5871a672dcb2b3871b58405 Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Sat, 7 Mar 2015 23:07:11 +0100 Subject: [PATCH] Continued with crawler: - Added class constant STACKER_NAME_URLS for stacker name 'urls' - enrichCrawlerQueueData() is still unfinished as there *must* be added more entries - Updated 'core' to latest commit Signed-off-by: Roland Haeder --- .../hub/main/source/class_BaseUrlSource.php | 33 +++++++++++++++++-- .../class_CrawlerUploadedListUrlSource.php | 24 ++++++++++---- core | 2 +- 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/application/hub/main/source/class_BaseUrlSource.php b/application/hub/main/source/class_BaseUrlSource.php index afc644923..149867f9e 100644 --- a/application/hub/main/source/class_BaseUrlSource.php +++ b/application/hub/main/source/class_BaseUrlSource.php @@ -22,6 +22,9 @@ * along with this program. If not, see . */ class BaseUrlSource extends BaseSource { + // Stack name for all URLs + const STACKER_NAME_URLS = 'urls'; + // Array elements for CSV data array const CRAWL_JOB_ARRAY_START_URL = 'start_url'; const CRAWL_JOB_ARRAY_DEPTH = 'start_depth'; @@ -60,7 +63,7 @@ class BaseUrlSource extends BaseSource { */ public function isUrlStackEmpty () { // Determine it - $isEmpty = $this->getStackInstance()->isStackEmpty('urls'); + $isEmpty = $this->getStackInstance()->isStackEmpty(self::STACKER_NAME_URLS); // Return result return $isEmpty; @@ -75,13 +78,37 @@ class BaseUrlSource extends BaseSource { * * @param $crawlData Array with partial data for being queued * @return void + * @todo ~10% done */ protected function enrichCrawlerQueueData (array &$crawlData) { // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!'); + + // Check for minimum array elements + assert(isset($crawlData[self::CRAWL_JOB_ARRAY_START_URL])); + assert(isset($crawlData[self::CRAWL_JOB_ARRAY_DEPTH])); + + // @TODO Add more elements + + // Debug message + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } + + /** + * Enqueues given crawler array in assigned file-based stack + * + * @param $crawlData Array with partial data for being queued + * @return void + */ + protected function enqueueInFileStack (array $crawlData) { + // Debug message + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!'); + + // Get the stack instance and enqueue it + $this->getStackInstance()->pushNamed(self::STACKER_NAME_URLS, $crawlData); // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } } diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index 749c3f631..ba2c81e97 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -159,7 +159,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R */ private function saveCsvDataInCrawlerQueue (array $csvData) { // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!'); // The array has 3 elements, later enhancements may accept more assert(count($csvData) == self::CRAWL_ENTRY_SIZE); @@ -175,11 +175,23 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2] ); + // Debug message + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - BEFORE!'); + // Then add more data to it - $this->enrichCrawlerQueueData($csvData); + $this->enrichCrawlerQueueData($csvArray); // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - AFTER!'); + + /* + * Then enqueue it in the file stack. The local crawler "task" will + * then pick this up. + */ + $this->enqueueInFileStack($csvArray); + + // Debug message + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } /** @@ -293,13 +305,13 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R */ private function parseCsvEntry () { // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); // Pop it from stack $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY); // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE)); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE)); // It must have 3 elements (see method parseCsvFile() for details) assert(count($csvData) == self::CRAWL_ENTRY_SIZE); @@ -308,7 +320,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R $this->saveCsvDataInCrawlerQueue($csvData); // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } /** diff --git a/core b/core index 23d0a8893..aebdd613a 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 23d0a889351670874a4b120e1487edf89dc1b540 +Subproject commit aebdd613aed13516c96d08fa6f72b3a11f4d3b85 -- 2.39.5