From 8d4c74182a1dd4d6ae3d5397cffe47778795af3d Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Thu, 5 Mar 2015 23:02:04 +0100 Subject: [PATCH] Now all URL sources (stacks) are initialized in a loop. This config entry will later be used by the crawlers to "look" for pending crawl jobs. Signed-off-by: Roland Haeder --- application/hub/config.php | 35 +++++++++-------- ...ss_CrawlerTaskHandlerInitializerFilter.php | 35 +++++++---------- .../class_CrawlerUploadedListUrlSource.php | 39 ++++++++++++++++++- 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/application/hub/config.php b/application/hub/config.php index c7401fe7d..e7d1dafac 100644 --- a/application/hub/config.php +++ b/application/hub/config.php @@ -1280,6 +1280,9 @@ $cfg->setConfigEntry('crawler_url_source_rss_start_class', 'CrawlerRssStartUrlSo // CFG: CRAWLER-URL-SOURCE-FOUND-RSS-CLASS $cfg->setConfigEntry('crawler_url_source_found_rss_class', 'CrawlerFoundRssUrlSource'); +// CFG: CRAWLER-URL-STACKS +$cfg->setConfigEntry('crawler_url_stacks', 'local_start:uploaded_list:rss_start:found_rss'); + // CFG: CRAWLER-NODE-COMMUNICATOR-TASK-CLASS $cfg->setConfigEntry('crawler_node_communicator_task_class', 'CrawlerNodeCommunicatorTask'); @@ -1325,29 +1328,29 @@ $cfg->setConfigEntry('task_crawler_node_communicator_interval_delay', 250); // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-MAX-RUNS $cfg->setConfigEntry('task_crawler_node_communicator_max_runs', 0); -// CFG: CRAWLER-LOCAL-URL-GETTER-TASK-CLASS -$cfg->setConfigEntry('crawler_local_url_getter_task_class', 'CrawlerLocalUrlGetterTask'); +// CFG: CRAWLER-LOCAL-URL-CRAWLER-TASK-CLASS +$cfg->setConfigEntry('crawler_local_url_crawler_task_class', 'CrawlerLocalUrlGetterTask'); -// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-STARTUP-DELAY -$cfg->setConfigEntry('task_crawler_local_url_getter_startup_delay', 1500); +// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_local_url_crawler_startup_delay', 1500); -// CFG: 
TASK-CRAWLER-LOCAL-URL-GETTER-INTERVAL-DELAY -$cfg->setConfigEntry('task_crawler_local_url_getter_interval_delay', 200); +// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_local_url_crawler_interval_delay', 200); -// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-MAX-RUNS -$cfg->setConfigEntry('task_crawler_local_url_getter_max_runs', 0); +// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-MAX-RUNS +$cfg->setConfigEntry('task_crawler_local_url_crawler_max_runs', 0); -// CFG: CRAWLER-REMOTE-URL-GETTER-TASK-CLASS -$cfg->setConfigEntry('crawler_remote_url_getter_task_class', 'CrawlerRemoteUrlGetterTask'); +// CFG: CRAWLER-REMOTE-URL-CRAWLER-TASK-CLASS +$cfg->setConfigEntry('crawler_remote_url_crawler_task_class', 'CrawlerRemoteUrlGetterTask'); -// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-STARTUP-DELAY -$cfg->setConfigEntry('task_crawler_remote_url_getter_startup_delay', 1500); +// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-STARTUP-DELAY +$cfg->setConfigEntry('task_crawler_remote_url_crawler_startup_delay', 1500); -// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-INTERVAL-DELAY -$cfg->setConfigEntry('task_crawler_remote_url_getter_interval_delay', 200); +// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-INTERVAL-DELAY +$cfg->setConfigEntry('task_crawler_remote_url_crawler_interval_delay', 200); -// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-MAX-RUNS -$cfg->setConfigEntry('task_crawler_remote_url_getter_max_runs', 0); +// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-MAX-RUNS +$cfg->setConfigEntry('task_crawler_remote_url_crawler_max_runs', 0); // CFG: CRAWLER-REMOTE-JOB-PUBLISHER-TASK-CLASS $cfg->setConfigEntry('crawler_remote_job_publisher_task_class', 'CrawlerRemoteJobPublisherTask'); diff --git a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php index 46a84ffef..36fe3946c 100644 --- a/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php 
+++ b/application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php @@ -69,13 +69,13 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_node_communicator_task_class'); $handlerInstance->registerTask('crawler_node_communicator', $taskInstance); - // 2) Local URL getter (gets URLs locally and adds them to the analyzer's input stack) - $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_getter_task_class'); - $handlerInstance->registerTask('crawler_local_url_getter', $taskInstance); + // 2) Local URL crawler (gets URLs locally and adds them to the analyzer's input stack) + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_crawler_task_class'); + $handlerInstance->registerTask('crawler_local_url_crawler', $taskInstance); - // 3) Remote URL getter (gets URLs locally for other nodes, also includes the crawled URL in local index) - $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_getter_task_class'); - $handlerInstance->registerTask('crawler_remote_url_getter', $taskInstance); + // 3) Remote URL crawler (gets URLs locally for other nodes, also includes the crawled URL in local index) + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_crawler_task_class'); + $handlerInstance->registerTask('crawler_remote_url_crawler', $taskInstance); // 4) Remote-crawl publisher (publishes crawl jobs for remote retrieval) $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_job_publisher_task_class'); @@ -101,23 +101,16 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_ping_task_class'); $handlerInstance->registerTask('crawler_ping', $taskInstance); - // 10) URL source: local start - $taskInstance = 
ObjectFactory::createObjectByConfiguredName('crawler_url_source_local_start_task_class'); - $handlerInstance->registerTask('crawler_url_source_local_start', $taskInstance); + // 10) URL sources + foreach (explode(':', $this->getConfigInstance()->getConfigEntry('crawler_url_stacks')) as $stack) { + // Init task instance + $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_' . $stack . '_task_class'); - // 11) URL source: uploaded list - $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_uploaded_list_task_class'); - $handlerInstance->registerTask('crawler_url_source_uploaded_list', $taskInstance); + // And register it + $handlerInstance->registerTask('crawler_url_source_' . $stack, $taskInstance); + } // END - foreach - // 12) URL source: RSS feed - $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_rss_start_task_class'); - $handlerInstance->registerTask('crawler_url_source_rss_start', $taskInstance); - - // 13) URL source: found RSS/ATOM feed - $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_found_rss_task_class'); - $handlerInstance->registerTask('crawler_url_source_found_rss', $taskInstance); - - // 14) Uploaded list scanner (checks for wanted files) + // 11) Uploaded list scanner (checks for wanted files) $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_scanner_task_class'); $handlerInstance->registerTask('crawler_uploaded_list_scanner', $taskInstance); diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index 2084887bb..02ff77f32 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -47,6 +47,15 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R */ const
STACK_NAME_CSV_ENTRY = 'csv_entry'; + /** + * Size of crawl (CSV) entry which is an indexed array: + * + * 0 = URL to crawl + * 1 = Crawl depth of URL + * 2 = Crawl depth of linked URLs (same other host only) + */ + const CRAWL_ENTRY_SIZE = 3; + /** * "Imported" CSV files */ @@ -140,6 +149,22 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R return $sourceInstance; } + /** + * Enriches and saves the given CSV entry (array) in the assigned + * file-based stack. To such an entry a lot more information is added, such + * as which files shall be crawled and many more. + * + * @param $csvData Array with data from a CSV file + * @return void + */ + private function saveCsvDataInCrawlerQueue (array $csvData) { + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!'); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } + /** * Checks whether a CSV file has been loaded (added to the stack) * @@ -229,7 +254,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } // END - if // ... with 3 elements, later enhancements may accept more - assert(count($csvData) == 3); + assert(count($csvData) == self::CRAWL_ENTRY_SIZE); /* * Push the file back on stack as it may contain more entries. This way @@ -253,6 +278,18 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R // Debug message /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + // Pop it from stack + $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . 
__LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE)); + + // It must have 3 elements (see method parseCsvFile() for details) + assert(count($csvData) == self::CRAWL_ENTRY_SIZE); + + // Save it in crawler queue (which will enrich it with way more information) + $this->saveCsvDataInCrawlerQueue($csvData); + // Debug message /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); } -- 2.39.5