From 4fa56f38f1a42be96a93cbffadda4d7eb31851e1 Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Tue, 6 May 2014 21:26:17 +0200 Subject: [PATCH] Added more tasks (see wiki: https://wiki.shipsimu.org/doku.php/de/projects/hub/applications/crawler ) Signed-off-by: Roland Haeder --- application/hub/config.php | 60 ++++++++++++++++ ...ss_CrawlerTaskHandlerInitializerFilter.php | 20 ++++++ .../tasks/crawler/document_parser/.htaccess | 1 + .../class_CrawlerDocumentParserTask.php | 72 +++++++++++++++++++ .../main/tasks/crawler/mime_sniffer/.htaccess | 1 + .../class_CrawlerMimeSnifferTask.php | 72 +++++++++++++++++++ .../hub/main/tasks/crawler/ping/.htaccess | 1 + .../crawler/ping/class_CrawlerPingTask.php | 72 +++++++++++++++++++ .../tasks/crawler/snippet_extractor/.htaccess | 1 + .../class_CrawlerSnippetExtractorTask.php | 72 +++++++++++++++++++ .../crawler/structure_analyzer/.htaccess | 1 + .../class_CrawlerStructureAnalyzerTask.php | 72 +++++++++++++++++++ 12 files changed, 445 insertions(+) create mode 100644 application/hub/main/tasks/crawler/document_parser/.htaccess create mode 100644 application/hub/main/tasks/crawler/document_parser/class_CrawlerDocumentParserTask.php create mode 100644 application/hub/main/tasks/crawler/mime_sniffer/.htaccess create mode 100644 application/hub/main/tasks/crawler/mime_sniffer/class_CrawlerMimeSnifferTask.php create mode 100644 application/hub/main/tasks/crawler/ping/.htaccess create mode 100644 application/hub/main/tasks/crawler/ping/class_CrawlerPingTask.php create mode 100644 application/hub/main/tasks/crawler/snippet_extractor/.htaccess create mode 100644 application/hub/main/tasks/crawler/snippet_extractor/class_CrawlerSnippetExtractorTask.php create mode 100644 application/hub/main/tasks/crawler/structure_analyzer/.htaccess create mode 100644 application/hub/main/tasks/crawler/structure_analyzer/class_CrawlerStructureAnalyzerTask.php diff --git a/application/hub/config.php b/application/hub/config.php index 
da3c26053..2aa1dba3a 100644 --- a/application/hub/config.php +++ b/application/hub/config.php @@ -1265,6 +1265,66 @@ $cfg->setConfigEntry('task_crawler_remote_job_publisher_interval_delay', 50); // CFG: TASK-CRAWLER-REMOTE-JOB-PUBLISHER-MAX-RUNS $cfg->setConfigEntry('task_crawler_remote_job_publisher_max_runs', 0); +// CFG: CRAWLER-MIME-SNIFFER-TASK-CLASS +$cfg->setConfigEntry('crawler_mime_sniffer_task_class', 'CrawlerMimeSnifferTask'); + +// CFG: TASK-CRAWLER-MIME-SNIFFER-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_mime_sniffer_startup_delay', 1500); + +// CFG: TASK-CRAWLER-MIME-SNIFFER-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_mime_sniffer_interval_delay', 100); + +// CFG: TASK-CRAWLER-MIME-SNIFFER-MAX-RUNS +$cfg->setConfigEntry('task_crawler_mime_sniffer_max_runs', 0); + +// CFG: CRAWLER-DOCUMENT-PARSER-TASK-CLASS +$cfg->setConfigEntry('crawler_document_parser_task_class', 'CrawlerDocumentParserTask'); + +// CFG: TASK-CRAWLER-DOCUMENT-PARSER-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_document_parser_startup_delay', 1500); + +// CFG: TASK-CRAWLER-DOCUMENT-PARSER-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_document_parser_interval_delay', 100); + +// CFG: TASK-CRAWLER-DOCUMENT-PARSER-MAX-RUNS +$cfg->setConfigEntry('task_crawler_document_parser_max_runs', 0); + +// CFG: CRAWLER-STRUCTURE-ANALYZER-TASK-CLASS +$cfg->setConfigEntry('crawler_structure_analyzer_task_class', 'CrawlerStructureAnalyzerTask'); + +// CFG: TASK-CRAWLER-STRUCTURE-ANALYZER-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_structure_analyzer_startup_delay', 1500); + +// CFG: TASK-CRAWLER-STRUCTURE-ANALYZER-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_structure_analyzer_interval_delay', 100); + +// CFG: TASK-CRAWLER-STRUCTURE-ANALYZER-MAX-RUNS +$cfg->setConfigEntry('task_crawler_structure_analyzer_max_runs', 0); + +// CFG: CRAWLER-SNIPPET-EXTRACTOR-TASK-CLASS +$cfg->setConfigEntry('crawler_snippet_extractor_task_class', 'CrawlerSnippetExtractorTask'); + +// 
CFG: TASK-CRAWLER-SNIPPET-EXTRACTOR-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_snippet_extractor_startup_delay', 1500); + +// CFG: TASK-CRAWLER-SNIPPET-EXTRACTOR-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_snippet_extractor_interval_delay', 100); + +// CFG: TASK-CRAWLER-SNIPPET-EXTRACTOR-MAX-RUNS +$cfg->setConfigEntry('task_crawler_snippet_extractor_max_runs', 0); + +// CFG: CRAWLER-PING-TASK-CLASS +$cfg->setConfigEntry('crawler_ping_task_class', 'CrawlerPingTask'); + +// CFG: TASK-CRAWLER-PING-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_ping_startup_delay', 1500); + +// CFG: TASK-CRAWLER-PING-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_ping_interval_delay', 100); + +// CFG: TASK-CRAWLER-PING-MAX-RUNS +$cfg->setConfigEntry('task_crawler_ping_max_runs', 0); + /////////////////////////////////////////////////////////////////////////////// // HTTP Configuration /////////////////////////////////////////////////////////////////////////////// diff --git a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php index 43a7be491..cb2a1383c 100644 --- a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php +++ b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php @@ -81,6 +81,26 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_job_publisher_task_class'); $handlerInstance->registerTask('crawler_remote_job_publisher', $taskInstance); + // 5) MIME sniffer + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_mime_sniffer_task_class'); + $handlerInstance->registerTask('crawler_mime_sniffer', $taskInstance); + + // 6) Document parser (converts document to meta format) + $taskInstance = 
ObjectFactory::createObjectByConfiguredName('crawler_document_parser_task_class'); + $handlerInstance->registerTask('crawler_document_parser', $taskInstance); + + // 7) Document structure analyzer + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_structure_analyzer_task_class'); + $handlerInstance->registerTask('crawler_structure_analyzer', $taskInstance); + + // 8) Snippet extractor + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_snippet_extractor_task_class'); + $handlerInstance->registerTask('crawler_snippet_extractor', $taskInstance); + + // 9) Node ping + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_ping_task_class'); + $handlerInstance->registerTask('crawler_ping', $taskInstance); + // Put the task handler in registry Registry::getRegistry()->addInstance('task_handler', $handlerInstance); } diff --git a/application/hub/main/tasks/crawler/document_parser/.htaccess b/application/hub/main/tasks/crawler/document_parser/.htaccess new file mode 100644 index 000000000..3a4288278 --- /dev/null +++ b/application/hub/main/tasks/crawler/document_parser/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/application/hub/main/tasks/crawler/document_parser/class_CrawlerDocumentParserTask.php b/application/hub/main/tasks/crawler/document_parser/class_CrawlerDocumentParserTask.php new file mode 100644 index 000000000..32b245da0 --- /dev/null +++ b/application/hub/main/tasks/crawler/document_parser/class_CrawlerDocumentParserTask.php @@ -0,0 +1,72 @@ + + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +class CrawlerDocumentParserTask extends BaseTask implements Taskable, Visitable { + /** + * Protected constructor, passes the name of this class (__CLASS__) on to the parent constructor + * + * @return void + */ + protected function __construct () { + // Call parent constructor + parent::__construct(__CLASS__); + } + + /** + * Creates an instance of this class (the constructor is protected, so code outside the class hierarchy cannot call it directly) + * + * @return $taskInstance A new CrawlerDocumentParserTask instance (implements Taskable and Visitable) + */ + public final static function createCrawlerDocumentParserTask () { + // Get new instance + $taskInstance = new CrawlerDocumentParserTask(); + + // Return the prepared instance + return $taskInstance; + } + + /** + * Accepts the given visitor and passes this task to its visitTask() method + * + * @param $visitorInstance An instance of a Visitor class + * @return void + * @todo Maybe visit some sub-objects + */ + public function accept (Visitor $visitorInstance) { + // Visit this task + $visitorInstance->visitTask($this); + } + + /** + * Executes the task, currently a stub that only calls partialStub() and does no real work yet + * + * @return void + * @todo 0% + */ + public function executeTask () { + $this->partialStub('Unimplemented task.'); + } +} + +// [EOF] +?> diff --git a/application/hub/main/tasks/crawler/mime_sniffer/.htaccess b/application/hub/main/tasks/crawler/mime_sniffer/.htaccess new file mode 100644 index 000000000..3a4288278 --- /dev/null +++ b/application/hub/main/tasks/crawler/mime_sniffer/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/application/hub/main/tasks/crawler/mime_sniffer/class_CrawlerMimeSnifferTask.php b/application/hub/main/tasks/crawler/mime_sniffer/class_CrawlerMimeSnifferTask.php new file mode 100644 index 000000000..6cd300178 --- /dev/null +++
b/application/hub/main/tasks/crawler/mime_sniffer/class_CrawlerMimeSnifferTask.php @@ -0,0 +1,72 @@ + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +class CrawlerMimeSnifferTask extends BaseTask implements Taskable, Visitable { + /** + * Protected constructor, passes the name of this class (__CLASS__) on to the parent constructor + * + * @return void + */ + protected function __construct () { + // Call parent constructor + parent::__construct(__CLASS__); + } + + /** + * Creates an instance of this class (the constructor is protected, so code outside the class hierarchy cannot call it directly) + * + * @return $taskInstance A new CrawlerMimeSnifferTask instance (implements Taskable and Visitable) + */ + public final static function createCrawlerMimeSnifferTask () { + // Get new instance + $taskInstance = new CrawlerMimeSnifferTask(); + + // Return the prepared instance + return $taskInstance; + } + + /** + * Accepts the given visitor and passes this task to its visitTask() method + * + * @param $visitorInstance An instance of a Visitor class + * @return void + * @todo Maybe visit some sub-objects + */ + public function accept (Visitor $visitorInstance) { + // Visit this task + $visitorInstance->visitTask($this); + } + + /** + * Executes the task, currently a stub that only calls partialStub() and does no real work yet + * + * @return void + * @todo 0% + */ + public function executeTask () { + $this->partialStub('Unimplemented task.'); + } +} + +// [EOF] +?> diff --git a/application/hub/main/tasks/crawler/ping/.htaccess
b/application/hub/main/tasks/crawler/ping/.htaccess new file mode 100644 index 000000000..3a4288278 --- /dev/null +++ b/application/hub/main/tasks/crawler/ping/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/application/hub/main/tasks/crawler/ping/class_CrawlerPingTask.php b/application/hub/main/tasks/crawler/ping/class_CrawlerPingTask.php new file mode 100644 index 000000000..ee2154c99 --- /dev/null +++ b/application/hub/main/tasks/crawler/ping/class_CrawlerPingTask.php @@ -0,0 +1,72 @@ + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +class CrawlerPingTask extends BaseTask implements Taskable, Visitable { + /** + * Protected constructor, passes the name of this class (__CLASS__) on to the parent constructor + * + * @return void + */ + protected function __construct () { + // Call parent constructor + parent::__construct(__CLASS__); + } + + /** + * Creates an instance of this class (the constructor is protected, so code outside the class hierarchy cannot call it directly) + * + * @return $taskInstance A new CrawlerPingTask instance (implements Taskable and Visitable) + */ + public final static function createCrawlerPingTask () { + // Get new instance + $taskInstance = new CrawlerPingTask(); + + // Return the prepared instance + return $taskInstance; + } + + /** + * Accepts the given visitor and passes this task to its visitTask() method + * + * @param $visitorInstance An instance of a Visitor class + * @return void + * @todo Maybe visit some sub-objects + */ + public function accept (Visitor $visitorInstance) { + // Visit this task + $visitorInstance->visitTask($this); + } + + /** + * Executes the task, currently a stub that only calls partialStub() and does no real work yet + * + * @return void + * @todo 0% + */ + public function executeTask () { + $this->partialStub('Unimplemented task.'); + } +} + +// [EOF] +?> diff --git a/application/hub/main/tasks/crawler/snippet_extractor/.htaccess b/application/hub/main/tasks/crawler/snippet_extractor/.htaccess new file mode 100644 index 000000000..3a4288278 --- /dev/null +++ b/application/hub/main/tasks/crawler/snippet_extractor/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/application/hub/main/tasks/crawler/snippet_extractor/class_CrawlerSnippetExtractorTask.php b/application/hub/main/tasks/crawler/snippet_extractor/class_CrawlerSnippetExtractorTask.php new file mode 100644 index 000000000..8b2f98a64 --- /dev/null +++ b/application/hub/main/tasks/crawler/snippet_extractor/class_CrawlerSnippetExtractorTask.php @@ -0,0 +1,72 @@ + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the
Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +class CrawlerSnippetExtractorTask extends BaseTask implements Taskable, Visitable { + /** + * Protected constructor, passes the name of this class (__CLASS__) on to the parent constructor + * + * @return void + */ + protected function __construct () { + // Call parent constructor + parent::__construct(__CLASS__); + } + + /** + * Creates an instance of this class (the constructor is protected, so code outside the class hierarchy cannot call it directly) + * + * @return $taskInstance A new CrawlerSnippetExtractorTask instance (implements Taskable and Visitable) + */ + public final static function createCrawlerSnippetExtractorTask () { + // Get new instance + $taskInstance = new CrawlerSnippetExtractorTask(); + + // Return the prepared instance + return $taskInstance; + } + + /** + * Accepts the given visitor and passes this task to its visitTask() method + * + * @param $visitorInstance An instance of a Visitor class + * @return void + * @todo Maybe visit some sub-objects + */ + public function accept (Visitor $visitorInstance) { + // Visit this task + $visitorInstance->visitTask($this); + } + + /** + * Executes the task, currently a stub that only calls partialStub() and does no real work yet + * + * @return void + * @todo 0% + */ + public function executeTask () { + $this->partialStub('Unimplemented task.'); + } +} + +// [EOF] +?> diff --git a/application/hub/main/tasks/crawler/structure_analyzer/.htaccess b/application/hub/main/tasks/crawler/structure_analyzer/.htaccess new file mode 100644 index 000000000..3a4288278 --- /dev/null +++ b/application/hub/main/tasks/crawler/structure_analyzer/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/application/hub/main/tasks/crawler/structure_analyzer/class_CrawlerStructureAnalyzerTask.php
b/application/hub/main/tasks/crawler/structure_analyzer/class_CrawlerStructureAnalyzerTask.php new file mode 100644 index 000000000..20e153c7c --- /dev/null +++ b/application/hub/main/tasks/crawler/structure_analyzer/class_CrawlerStructureAnalyzerTask.php @@ -0,0 +1,72 @@ + * @version 0.0.0 + * @copyright Copyright (c) 2014 Crawler Developer Team + * @license GNU GPL 3.0 or any newer version + * @link http://www.ship-simu.org + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +class CrawlerStructureAnalyzerTask extends BaseTask implements Taskable, Visitable { + /** + * Protected constructor, passes the name of this class (__CLASS__) on to the parent constructor + * + * @return void + */ + protected function __construct () { + // Call parent constructor + parent::__construct(__CLASS__); + } + + /** + * Creates an instance of this class (the constructor is protected, so code outside the class hierarchy cannot call it directly) + * + * @return $taskInstance A new CrawlerStructureAnalyzerTask instance (implements Taskable and Visitable) + */ + public final static function createCrawlerStructureAnalyzerTask () { + // Get new instance + $taskInstance = new CrawlerStructureAnalyzerTask(); + + // Return the prepared instance + return $taskInstance; + } + + /** + * Accepts the given visitor and passes this task to its visitTask() method + * + * @param $visitorInstance An instance of a Visitor class + * @return void + * @todo Maybe visit some sub-objects + */ + public function accept (Visitor $visitorInstance) { + // Visit this task + $visitorInstance->visitTask($this); + } + + /** + * Executes the task, currently a stub that only calls partialStub() and does no real work yet + * + * @return void + * @todo 0% + */ + public function executeTask () { + $this->partialStub('Unimplemented task.'); + } +} + +// [EOF] +?> -- 2.39.5