From 1fdae03279d808dce54ac5125b138376fdba3436 Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Sat, 7 Mar 2015 18:54:54 +0100 Subject: [PATCH] Continued with crawler: - Renamed method processStack() to fillUrlStack() to reflect its purpose - Added isUrlStackEmpty() to interface as it is now public - Added new base class BaseUrlSourceTask which will initialize all such tasks by creating the proper URL source instance - Constants belong to top of classes Signed-off-by: Roland Haeder --- .../source/urls/class_UrlSource.php | 11 +++- .../tcp/class_TcpProtocolResolver.php | 2 +- .../hub/main/source/class_BaseUrlSource.php | 20 ++++++- .../urls/class_CrawlerFoundRssUrlSource.php | 12 ++--- .../urls/class_CrawlerLocalStartUrlSource.php | 12 ++--- .../urls/class_CrawlerRssStartUrlSource.php | 12 ++--- .../class_CrawlerUploadedListUrlSource.php | 45 ++++++++-------- .../tasks/crawler/class_BaseUrlSourceTask.php | 54 +++++++++++++++++++ .../crawler/url_source/class_CrawlerUrlSource | 2 +- .../class_CrawlerUrlSourceFoundRssTask.php | 4 +- .../class_CrawlerUrlSourceLocalStartTask.php | 4 +- .../class_CrawlerUrlSourceRssStartTask.php | 4 +- ...class_CrawlerUrlSourceUploadedListTask.php | 4 +- 13 files changed, 122 insertions(+), 64 deletions(-) create mode 100644 application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php diff --git a/application/hub/interfaces/source/urls/class_UrlSource.php b/application/hub/interfaces/source/urls/class_UrlSource.php index e95634b5c..6a4a04677 100644 --- a/application/hub/interfaces/source/urls/class_UrlSource.php +++ b/application/hub/interfaces/source/urls/class_UrlSource.php @@ -23,11 +23,18 @@ */ interface UrlSource extends Source { /** - * Processes entries in the stack. + * Fills the URL stack with new entries from source * * @return void */ - function processStack (); + function fillUrlStack (); + + /** + * Determines whether the stack 'urls' is empty. + * + * @return $isEmpty Whether the stack 'urls' is empty. 
+ */ + function isUrlStackEmpty (); } // [EOF] diff --git a/application/hub/main/resolver/protocol/tcp/class_TcpProtocolResolver.php b/application/hub/main/resolver/protocol/tcp/class_TcpProtocolResolver.php index 027f0497f..758ae7804 100644 --- a/application/hub/main/resolver/protocol/tcp/class_TcpProtocolResolver.php +++ b/application/hub/main/resolver/protocol/tcp/class_TcpProtocolResolver.php @@ -67,7 +67,7 @@ class TcpProtocolResolver extends BaseProtocolResolver implements ProtocolResolv $resultInstance = $nodeInstance->getWrapperInstance()->doSelectByCriteria($searchInstance); // Is the result valid? - if ((!$resultInstance->valid()) || (! $resultInstance->next())) { + if ((!$resultInstance->valid()) || (!$resultInstance->next())) { // Node not found in database, this could mean that your database file is damaged. return NULL; } // END - if diff --git a/application/hub/main/source/class_BaseUrlSource.php b/application/hub/main/source/class_BaseUrlSource.php index 18286ba28..afc644923 100644 --- a/application/hub/main/source/class_BaseUrlSource.php +++ b/application/hub/main/source/class_BaseUrlSource.php @@ -58,13 +58,31 @@ class BaseUrlSource extends BaseSource { * * @return $isEmpty Whether the stack 'urls' is empty. */ - protected function isUrlStackEmpty () { + public function isUrlStackEmpty () { // Determine it $isEmpty = $this->getStackInstance()->isStackEmpty('urls'); // Return result return $isEmpty; } + + /** + * Enriches the given associative array with more data, now at least 2 + * elements are required: + * + * 'start_url' - Starting URL + * 'start_depth' - Crawl depth for starting URL + * + * @param $crawlData Array with partial data for being queued + * @return void + */ + protected function enrichCrawlerQueueData (array &$crawlData) { + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . 
' - CALLED!'); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } } // [EOF] diff --git a/application/hub/main/source/urls/class_CrawlerFoundRssUrlSource.php b/application/hub/main/source/urls/class_CrawlerFoundRssUrlSource.php index d77847c2c..cad4691e7 100644 --- a/application/hub/main/source/urls/class_CrawlerFoundRssUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerFoundRssUrlSource.php @@ -49,18 +49,12 @@ class CrawlerFoundRssUrlSource extends BaseUrlSource implements UrlSource, Regis } /** - * Processes entries in the stack. + * Fills the URL stack with new entries from source * * @return void - * @todo ~10% done + * @todo 0% done */ - public function processStack () { - // Does the stack have some entries left? - if ($this->isUrlStackEmpty()) { - // Nothing to handle here - return; - } // END - if - + public function fillUrlStack () { $this->partialStub('Please implement this method.'); } } diff --git a/application/hub/main/source/urls/class_CrawlerLocalStartUrlSource.php b/application/hub/main/source/urls/class_CrawlerLocalStartUrlSource.php index a2679ebdb..fdabe0642 100644 --- a/application/hub/main/source/urls/class_CrawlerLocalStartUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerLocalStartUrlSource.php @@ -49,18 +49,12 @@ class CrawlerLocalStartUrlSource extends BaseUrlSource implements UrlSource, Reg } /** - * Processes entries in the stack. + * Fills the URL stack with new entries from source * * @return void - * @todo ~10% done + * @todo 0% done */ - public function processStack () { - // Does the stack have some entries left? 
- if ($this->isUrlStackEmpty()) { - // Nothing to handle here - return; - } // END - if - + public function fillUrlStack () { $this->partialStub('Please implement this method.'); } } diff --git a/application/hub/main/source/urls/class_CrawlerRssStartUrlSource.php b/application/hub/main/source/urls/class_CrawlerRssStartUrlSource.php index e955d027f..ef6ade1bd 100644 --- a/application/hub/main/source/urls/class_CrawlerRssStartUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerRssStartUrlSource.php @@ -49,18 +49,12 @@ class CrawlerRssStartUrlSource extends BaseUrlSource implements UrlSource, Regis } /** - * Processes entries in the stack. + * Fills the URL stack with new entries from source * * @return void - * @todo ~10% done + * @todo 0% done */ - public function processStack () { - // Does the stack have some entries left? - if ($this->isUrlStackEmpty()) { - // Nothing to handle here - return; - } // END - if - + public function fillUrlStack () { $this->partialStub('Please implement this method.'); } } diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index dd5ff03bc..749c3f631 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -22,21 +22,6 @@ * along with this program. If not, see . 
*/ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable { - /** - * "Cached" CSV path - */ - private $csvFilePath = ''; - - /** - * Last CSV file instance - */ - private $lastCsvFileInstance = NULL; - - /** - * Stack for pushing data from this clas to another - */ - private $stackSourceInstance = NULL; - /** * Stack name for a CSV file */ @@ -56,6 +41,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R */ const CRAWL_ENTRY_SIZE = 3; + /** + * "Cached" CSV path + */ + private $csvFilePath = ''; + + /** + * Last CSV file instance + */ + private $lastCsvFileInstance = NULL; + + /** + * Stack for pushing data from this class to another + */ + private $stackSourceInstance = NULL; + /** * "Imported" CSV files */ @@ -175,6 +175,9 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2] ); + // Then add more data to it + $this->enrichCrawlerQueueData($csvData); + // Debug message /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } @@ -318,12 +321,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } /** - * Processes entries in the stack. + * Fills the URL stack with new entries from source * * @return void - * @todo ~20% done + * @todo ~40% done */ - public function processStack () { + public function fillUrlStack () { // Does the stack have some entries left? if ($this->isCsvEntryAdded()) { /* @@ -344,12 +347,6 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * depth, handling of 3rd-party URLs and such. */ $this->addCsvFile(); - } elseif (!$this->isUrlStackEmpty()) { - /* - * Handle next entry. This method will be called very often, so need - * to process more than one entry at a time. 
- */ - $this->processNextEntry(); } $this->partialStub('Please implement this method.'); diff --git a/application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php b/application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php new file mode 100644 index 000000000..998980272 --- /dev/null +++ b/application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php @@ -0,0 +1,54 @@ + + * @version 0.0.0 + * @copyright Copyright (c) 2007, 2008 Roland Haeder, 2009 - 2014 Hub Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.shipsimu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +class BaseUrlSourceTask extends BaseTask { + /** + * Protected constructor + * + * @param $className Name of the class + * @return void + */ + protected function __construct ($className) { + // Call parent constructor + parent::__construct($className); + + // Init this URL source task + $this->initUrlSourceTask(); + } + + /** + * Initializes URL source task (to keep the constructor small) + * + * @return void + */ + private function initUrlSourceTask () { + // Get source instance + $sourceInstance = UrlSourceObjectFactory::createUrlSourceInstance($this); + + // And set it here + $this->setSourceInstance($sourceInstance); + } +} + +// [EOF] +?> diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource index b62a9df06..48a6a42c8 100644 --- a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource @@ -21,7 +21,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -class CrawlerUrlSource???Task extends BaseTask implements Taskable, Visitable { +class CrawlerUrlSource???Task extends BaseUrlSourceTask implements Taskable, Visitable { /** * Protected constructor * diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php index 14236c6ff..dd912b465 100644 --- a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php @@ -21,7 +21,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -class CrawlerUrlSourceFoundRssTask extends BaseTask implements Taskable, Visitable { +class CrawlerUrlSourceFoundRssTask extends BaseUrlSourceTask implements Taskable, Visitable { /** * Protected constructor * @@ -64,7 +64,7 @@ class CrawlerUrlSourceFoundRssTask extends BaseTask implements Taskable, Visitab */ public function executeTask () { // Get the URL source instance and announce us - UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack(); + $this->getSourceInstance()->fillUrlStack(); } } diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php index 9fdb71d1d..5938d2cb7 100644 --- a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php @@ -21,7 +21,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -class CrawlerUrlSourceLocalStartTask extends BaseTask implements Taskable, Visitable { +class CrawlerUrlSourceLocalStartTask extends BaseUrlSourceTask implements Taskable, Visitable { /** * Protected constructor * @@ -64,7 +64,7 @@ class CrawlerUrlSourceLocalStartTask extends BaseTask implements Taskable, Visit */ public function executeTask () { // Get the URL source instance and announce us - UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack(); + $this->getSourceInstance()->fillUrlStack(); } } diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php index 413c7ad5d..e8c317541 100644 --- a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php @@ -21,7 +21,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ -class CrawlerUrlSourceRssStartTask extends BaseTask implements Taskable, Visitable { +class CrawlerUrlSourceRssStartTask extends BaseUrlSourceTask implements Taskable, Visitable { /** * Protected constructor * @@ -64,7 +64,7 @@ class CrawlerUrlSourceRssStartTask extends BaseTask implements Taskable, Visitab */ public function executeTask () { // Get the URL source instance and announce us - UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack(); + $this->getSourceInstance()->fillUrlStack(); } } diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php index 7330dda2d..d80924858 100644 --- a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php @@ -21,7 +21,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -class CrawlerUrlSourceUploadedListTask extends BaseTask implements Taskable, Visitable { +class CrawlerUrlSourceUploadedListTask extends BaseUrlSourceTask implements Taskable, Visitable { /** * Protected constructor * @@ -64,7 +64,7 @@ class CrawlerUrlSourceUploadedListTask extends BaseTask implements Taskable, Vis */ public function executeTask () { // Get the URL source instance and announce us - UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack(); + $this->getSourceInstance()->fillUrlStack(); } } -- 2.39.5