From 20bcb77223bd276e1f16d2c762893791feb6c28e Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Wed, 7 May 2014 22:18:54 +0200 Subject: [PATCH] Continued: - added new tasks for URL sources (I need to split it in separate tasks to maintain a good cycle time). - removed deprecated files and directories - added "demo" list for URL list source Signed-off-by: Roland Haeder --- application/hub/config.php | 48 +++++++++++++ ...ss_CrawlerTaskHandlerInitializerFilter.php | 16 +++++ .../{hub => crawler/url_source}/.htaccess | 0 .../crawler/url_source/class_CrawlerUrlSource | 72 +++++++++++++++++++ .../class_CrawlerUrlSourceFoundRssTask.php | 72 +++++++++++++++++++ .../class_CrawlerUrlSourceLocalStartTask.php | 72 +++++++++++++++++++ .../class_CrawlerUrlSourceRssStartTask.php | 72 +++++++++++++++++++ ...class_CrawlerUrlSourceUploadedListTask.php | 72 +++++++++++++++++++ .../hub/main/tasks/hub/chunks/.htaccess | 1 - application/hub/main/tasks/hub/class_Hub | 3 - .../hub/main/tasks/hub/decoder/.htaccess | 1 - application/hub/main/tasks/hub/ping/.htaccess | 1 - .../hub/main/tasks/hub/update/.htaccess | 1 - .../hub/announcement => url_lists}/.htaccess | 0 url_lists/demo.lst | 1 + 15 files changed, 425 insertions(+), 7 deletions(-) rename application/hub/main/tasks/{hub => crawler/url_source}/.htaccess (100%) create mode 100644 application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource create mode 100644 application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php create mode 100644 application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php create mode 100644 application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php create mode 100644 application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php delete mode 100644 application/hub/main/tasks/hub/chunks/.htaccess delete mode 100644 application/hub/main/tasks/hub/class_Hub delete mode 100644 
application/hub/main/tasks/hub/decoder/.htaccess delete mode 100644 application/hub/main/tasks/hub/ping/.htaccess delete mode 100644 application/hub/main/tasks/hub/update/.htaccess rename {application/hub/main/tasks/hub/announcement => url_lists}/.htaccess (100%) create mode 100644 url_lists/demo.lst diff --git a/application/hub/config.php b/application/hub/config.php index 478c03cdb..ccd06b2e4 100644 --- a/application/hub/config.php +++ b/application/hub/config.php @@ -1325,6 +1325,54 @@ $cfg->setConfigEntry('task_crawler_ping_interval_delay', 250); // CFG: TASK-CRAWLER-PING-MAX-RUNS $cfg->setConfigEntry('task_crawler_ping_max_runs', 0); +// CFG: CRAWLER-URL-SOURCE-LOCAL-START-TASK-CLASS +$cfg->setConfigEntry('crawler_url_source_local_start_task_class', 'CrawlerUrlSourceLocalStartTask'); + +// CFG: TASK-CRAWLER-URL-SOURCE-LOCAL-START-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_url_source_local_start_startup_delay', 3000); + +// CFG: TASK-CRAWLER-URL-SOURCE-LOCAL-START-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_url_source_local_start_interval_delay', 150); + +// CFG: TASK-CRAWLER-URL-SOURCE-LOCAL-START-MAX-RUNS +$cfg->setConfigEntry('task_crawler_url_source_local_start_max_runs', 0); + +// CFG: CRAWLER-URL-SOURCE-UPLOADED-LIST-TASK-CLASS +$cfg->setConfigEntry('crawler_url_source_uploaded_list_task_class', 'CrawlerUrlSourceUploadedListTask'); + +// CFG: TASK-CRAWLER-URL-SOURCE-UPLOADED-LIST-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_url_source_uploaded_list_startup_delay', 3000); + +// CFG: TASK-CRAWLER-URL-SOURCE-UPLOADED-LIST-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_url_source_uploaded_list_interval_delay', 150); + +// CFG: TASK-CRAWLER-URL-SOURCE-UPLOADED-LIST-MAX-RUNS +$cfg->setConfigEntry('task_crawler_url_source_uploaded_list_max_runs', 0); + +// CFG: CRAWLER-URL-SOURCE-RSS-START-TASK-CLASS +$cfg->setConfigEntry('crawler_url_source_rss_start_task_class', 'CrawlerUrlSourceRssStartTask'); + +// CFG: 
TASK-CRAWLER-URL-SOURCE-RSS-START-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_url_source_rss_start_startup_delay', 3000); + +// CFG: TASK-CRAWLER-URL-SOURCE-RSS-START-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_url_source_rss_start_interval_delay', 150); + +// CFG: TASK-CRAWLER-URL-SOURCE-RSS-START-MAX-RUNS +$cfg->setConfigEntry('task_crawler_url_source_rss_start_max_runs', 0); + +// CFG: CRAWLER-URL-SOURCE-FOUND-RSS-TASK-CLASS +$cfg->setConfigEntry('crawler_url_source_found_rss_task_class', 'CrawlerUrlSourceFoundRssTask'); + +// CFG: TASK-CRAWLER-URL-SOURCE-FOUND-RSS-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_url_source_found_rss_startup_delay', 3000); + +// CFG: TASK-CRAWLER-URL-SOURCE-FOUND-RSS-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_url_source_found_rss_interval_delay', 150); + +// CFG: TASK-CRAWLER-URL-SOURCE-FOUND-RSS-MAX-RUNS +$cfg->setConfigEntry('task_crawler_url_source_found_rss_max_runs', 0); + /////////////////////////////////////////////////////////////////////////////// // HTTP Configuration /////////////////////////////////////////////////////////////////////////////// diff --git a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php index cb2a1383c..4ba0ed0b1 100644 --- a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php +++ b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php @@ -101,6 +101,22 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_ping_task_class'); $handlerInstance->registerTask('crawler_ping', $taskInstance); + // 10) URL source: local start + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_local_start_task_class'); + 
$handlerInstance->registerTask('crawler_url_source_local_start', $taskInstance); + + // 11) URL source: uploaded list + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_uploaded_list_task_class'); + $handlerInstance->registerTask('crawler_url_source_uploaded_list', $taskInstance); + + // 12) URL source: RSS feed + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_rss_start_task_class'); + $handlerInstance->registerTask('crawler_url_source_rss_start', $taskInstance); + + // 13) URL source: found RSS/ATOM feed + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_found_rss_task_class'); + $handlerInstance->registerTask('crawler_url_source_found_rss', $taskInstance); + // Put the task handler in registry Registry::getRegistry()->addInstance('task_handler', $handlerInstance); } diff --git a/application/hub/main/tasks/hub/.htaccess b/application/hub/main/tasks/crawler/url_source/.htaccess similarity index 100% rename from application/hub/main/tasks/hub/.htaccess rename to application/hub/main/tasks/crawler/url_source/.htaccess diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource new file mode 100644 index 000000000..fc9fa9f0e --- /dev/null +++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource @@ -0,0 +1,72 @@ + + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+class CrawlerUrlSource???Task extends BaseTask implements Taskable, Visitable {
+	/**
+	 * Protected constructor, use createCrawlerUrlSource???Task() to get an instance
+	 *
+	 * @return void
+	 */
+	protected function __construct () {
+		// Hand this class' name over to the parent (BaseTask) constructor
+		parent::__construct(__CLASS__);
+	}
+
+	/**
+	 * Creates a new instance of this task class (static factory method)
+	 *
+	 * @return $taskInstance A new instance of this class (Taskable and Visitable)
+	 */
+	public final static function createCrawlerUrlSource???Task () {
+		// Get new instance, factory and class name must carry the same ??? replacement
+		$taskInstance = new CrawlerUrlSource???Task();
+
+		// Return the prepared instance
+		return $taskInstance;
+	}
+
+	/**
+	 * Accepts the given visitor by letting it visit this task
+	 *
+	 * @param $visitorInstance An instance of a Visitor class
+	 * @return void
+	 * @todo Maybe visit some sub-objects
+	 */
+	public function accept (Visitor $visitorInstance) {
+		// Visit this task
+		$visitorInstance->visitTask($this);
+	}
+
+	/**
+	 * Executes the task, currently it only calls partialStub() as nothing is implemented yet
+	 *
+	 * @return void
+	 * @todo 0%
+	 */
+	public function executeTask () {
+		$this->partialStub('Unimplemented task.');
+	}
+}
+
+// [EOF]
+?>
diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php
new file mode 100644
index 000000000..1988739de
--- /dev/null
+++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php
@@ -0,0 +1,72 @@
+
+ * @version 0.0.0
+ * @copyright Copyright (c) 2014 Crawler Developer Team
+ * @license GNU GPL 3.0 or any newer version
+ * @link http://www.ship-simu.org
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms
of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+class CrawlerUrlSourceFoundRssTask extends BaseTask implements Taskable, Visitable {
+	/**
+	 * Protected constructor, use createCrawlerUrlSourceFoundRssTask() to get an instance
+	 *
+	 * @return void
+	 */
+	protected function __construct () {
+		// Hand this class' name over to the parent (BaseTask) constructor
+		parent::__construct(__CLASS__);
+	}
+
+	/**
+	 * Creates a new instance of this task class (static factory method)
+	 *
+	 * @return $taskInstance A new CrawlerUrlSourceFoundRssTask instance (Taskable and Visitable)
+	 */
+	public final static function createCrawlerUrlSourceFoundRssTask () {
+		// Get new instance (the protected constructor is reachable from inside the class)
+		$taskInstance = new CrawlerUrlSourceFoundRssTask();
+
+		// Return the prepared instance
+		return $taskInstance;
+	}
+
+	/**
+	 * Accepts the given visitor by letting it visit this task
+	 *
+	 * @param $visitorInstance An instance of a Visitor class
+	 * @return void
+	 * @todo Maybe visit some sub-objects
+	 */
+	public function accept (Visitor $visitorInstance) {
+		// Visit this task
+		$visitorInstance->visitTask($this);
+	}
+
+	/**
+	 * Executes the task, currently it only calls partialStub() as nothing is implemented yet
+	 *
+	 * @return void
+	 * @todo 0%
+	 */
+	public function executeTask () {
+		$this->partialStub('Unimplemented task.');
+	}
+}
+
+// [EOF]
+?>
diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php
new file mode 100644
index 000000000..0972f062a
--- /dev/null
+++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php
@@ -0,0 +1,72 @@
+
+ * @version 0.0.0
+ * @copyright Copyright (c) 2014
Crawler Developer Team
+ * @license GNU GPL 3.0 or any newer version
+ * @link http://www.ship-simu.org
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+class CrawlerUrlSourceLocalStartTask extends BaseTask implements Taskable, Visitable {
+	/**
+	 * Protected constructor, use createCrawlerUrlSourceLocalStartTask() to get an instance
+	 *
+	 * @return void
+	 */
+	protected function __construct () {
+		// Hand this class' name over to the parent (BaseTask) constructor
+		parent::__construct(__CLASS__);
+	}
+
+	/**
+	 * Creates a new instance of this task class (static factory method)
+	 *
+	 * @return $taskInstance A new CrawlerUrlSourceLocalStartTask instance (Taskable and Visitable)
+	 */
+	public final static function createCrawlerUrlSourceLocalStartTask () {
+		// Get new instance (the protected constructor is reachable from inside the class)
+		$taskInstance = new CrawlerUrlSourceLocalStartTask();
+
+		// Return the prepared instance
+		return $taskInstance;
+	}
+
+	/**
+	 * Accepts the given visitor by letting it visit this task
+	 *
+	 * @param $visitorInstance An instance of a Visitor class
+	 * @return void
+	 * @todo Maybe visit some sub-objects
+	 */
+	public function accept (Visitor $visitorInstance) {
+		// Visit this task
+		$visitorInstance->visitTask($this);
+	}
+
+	/**
+	 * Executes the task, currently it only calls partialStub() as nothing is implemented yet
+	 *
+	 * @return void
+	 * @todo 0%
+	 */
+	public function executeTask () {
+		$this->partialStub('Unimplemented task.');
+	}
+}
+
+// [EOF]
+?>
diff --git a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php
new file mode 100644
index
000000000..fd1a77d45
--- /dev/null
+++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php
@@ -0,0 +1,72 @@
+
+ * @version 0.0.0
+ * @copyright Copyright (c) 2014 Crawler Developer Team
+ * @license GNU GPL 3.0 or any newer version
+ * @link http://www.ship-simu.org
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+class CrawlerUrlSourceRssStartTask extends BaseTask implements Taskable, Visitable {
+	/**
+	 * Protected constructor, use createCrawlerUrlSourceRssStartTask() to get an instance
+	 *
+	 * @return void
+	 */
+	protected function __construct () {
+		// Hand this class' name over to the parent (BaseTask) constructor
+		parent::__construct(__CLASS__);
+	}
+
+	/**
+	 * Creates a new instance of this task class (static factory method)
+	 *
+	 * @return $taskInstance A new CrawlerUrlSourceRssStartTask instance (Taskable and Visitable)
+	 */
+	public final static function createCrawlerUrlSourceRssStartTask () {
+		// Get new instance (the protected constructor is reachable from inside the class)
+		$taskInstance = new CrawlerUrlSourceRssStartTask();
+
+		// Return the prepared instance
+		return $taskInstance;
+	}
+
+	/**
+	 * Accepts the given visitor by letting it visit this task
+	 *
+	 * @param $visitorInstance An instance of a Visitor class
+	 * @return void
+	 * @todo Maybe visit some sub-objects
+	 */
+	public function accept (Visitor $visitorInstance) {
+		// Visit this task
+		$visitorInstance->visitTask($this);
+	}
+
+	/**
+	 * Executes the task, currently it only calls partialStub() as nothing is implemented yet
+	 *
+	 * @return void
+	 * @todo 0%
+	 */
+	public function executeTask () {
+		$this->partialStub('Unimplemented task.');
+	}
+}
+
+// [EOF]
+?>
diff --git
a/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php
new file mode 100644
index 000000000..c8b4b7085
--- /dev/null
+++ b/application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php
@@ -0,0 +1,72 @@
+
+ * @version 0.0.0
+ * @copyright Copyright (c) 2014 Crawler Developer Team
+ * @license GNU GPL 3.0 or any newer version
+ * @link http://www.ship-simu.org
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+class CrawlerUrlSourceUploadedListTask extends BaseTask implements Taskable, Visitable {
+	/**
+	 * Protected constructor, use createCrawlerUrlSourceUploadedListTask() to get an instance
+	 *
+	 * @return void
+	 */
+	protected function __construct () {
+		// Hand this class' name over to the parent (BaseTask) constructor
+		parent::__construct(__CLASS__);
+	}
+
+	/**
+	 * Creates a new instance of this task class (static factory method)
+	 *
+	 * @return $taskInstance A new CrawlerUrlSourceUploadedListTask instance (Taskable and Visitable)
+	 */
+	public final static function createCrawlerUrlSourceUploadedListTask () {
+		// Get new instance (the protected constructor is reachable from inside the class)
+		$taskInstance = new CrawlerUrlSourceUploadedListTask();
+
+		// Return the prepared instance
+		return $taskInstance;
+	}
+
+	/**
+	 * Accepts the given visitor by letting it visit this task
+	 *
+	 * @param $visitorInstance An instance of a Visitor class
+	 * @return void
+	 * @todo Maybe visit some sub-objects
+	 */
+	public function accept (Visitor $visitorInstance) {
+		// Visit this task
+		$visitorInstance->visitTask($this);
+	}
+
+	/**
+	 * Executes the task, currently it only calls partialStub() as nothing is implemented yet
+	 *
+	 * @return void
+	 * @todo 0%
+	 */
+	public function executeTask () {
+		$this->partialStub('Unimplemented task.');
+	}
+}
+
+// [EOF]
+?>
diff --git a/application/hub/main/tasks/hub/chunks/.htaccess b/application/hub/main/tasks/hub/chunks/.htaccess
deleted file mode 100644
index 3a4288278..000000000
--- a/application/hub/main/tasks/hub/chunks/.htaccess
+++ /dev/null
@@ -1 +0,0 @@
-Deny from all
diff --git a/application/hub/main/tasks/hub/class_Hub b/application/hub/main/tasks/hub/class_Hub
deleted file mode 100644
index f551ef47b..000000000
--- a/application/hub/main/tasks/hub/class_Hub
+++ /dev/null
@@ -1,3 +0,0 @@
-
diff --git a/application/hub/main/tasks/hub/decoder/.htaccess b/application/hub/main/tasks/hub/decoder/.htaccess
deleted file mode 100644
index 3a4288278..000000000
--- a/application/hub/main/tasks/hub/decoder/.htaccess
+++ /dev/null
@@ -1 +0,0 @@
-Deny from all
diff --git a/application/hub/main/tasks/hub/ping/.htaccess b/application/hub/main/tasks/hub/ping/.htaccess
deleted file mode 100644
index 3a4288278..000000000
---
a/application/hub/main/tasks/hub/ping/.htaccess +++ /dev/null @@ -1 +0,0 @@ -Deny from all diff --git a/application/hub/main/tasks/hub/update/.htaccess b/application/hub/main/tasks/hub/update/.htaccess deleted file mode 100644 index 3a4288278..000000000 --- a/application/hub/main/tasks/hub/update/.htaccess +++ /dev/null @@ -1 +0,0 @@ -Deny from all diff --git a/application/hub/main/tasks/hub/announcement/.htaccess b/url_lists/.htaccess similarity index 100% rename from application/hub/main/tasks/hub/announcement/.htaccess rename to url_lists/.htaccess diff --git a/url_lists/demo.lst b/url_lists/demo.lst new file mode 100644 index 000000000..84b5b400b --- /dev/null +++ b/url_lists/demo.lst @@ -0,0 +1 @@ +http://mxchange.org -- 2.39.5