git.mxchange.org Git - hub.git/commitdiff

author		Roland Haeder <roland@mxchange.org>
		Thu, 5 Mar 2015 22:02:04 +0000 (23:02 +0100)
committer	Roland Haeder <roland@mxchange.org>
		Thu, 5 Mar 2015 22:02:04 +0000 (23:02 +0100)

Now all URL sources (stacks) are initialized in a loop. This config entry will
later be used by the crawlers to "look" for pending crawl jobs.

Signed-off-by: Roland Haeder <roland@mxchange.org>
application/hub/config.php
application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
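
This commit replaces four hard-coded URL-source task registrations with one
loop driven by the new 'crawler_url_stacks' config entry. A minimal
stand-alone sketch of the key-derivation mechanism, without the framework's
configuration instance and ObjectFactory (see the actual diff below):

<?php
// Value of the new config entry added in application/hub/config.php below.
$crawlerUrlStacks = 'local_start:uploaded_list:rss_start:found_rss';

foreach (explode(':', $crawlerUrlStacks) as $stack) {
	// Each stack name expands into the config key holding the task class ...
	$configKey = 'crawler_url_source_' . $stack . '_task_class';

	// ... and into the task name the created instance is registered under.
	$taskName = 'crawler_url_source_' . $stack;

	// E.g. crawler_url_source_local_start <- crawler_url_source_local_start_task_class
	echo $taskName . ' <- ' . $configKey . PHP_EOL;
}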

application/hub/config.php
index c7401fe7d9105e0feea82a57f92c3f9f5d720907..e7d1dafacb4b3d0355e7dad494f01a9bc7d78b4c 100644 (file)
@@ -1280,6 +1280,9 @@ $cfg->setConfigEntry('crawler_url_source_rss_start_class', 'CrawlerRssStartUrlSo
 // CFG: CRAWLER-URL-SOURCE-FOUND-RSS-CLASS
 $cfg->setConfigEntry('crawler_url_source_found_rss_class', 'CrawlerFoundRssUrlSource');
 
+// CFG: CRAWLER-URL-STACKS
+$cfg->setConfigEntry('crawler_url_stacks', 'local_start:uploaded_list:rss_start:found_rss');
+
 // CFG: CRAWLER-NODE-COMMUNICATOR-TASK-CLASS
 $cfg->setConfigEntry('crawler_node_communicator_task_class', 'CrawlerNodeCommunicatorTask');
 
@@ -1325,29 +1328,29 @@ $cfg->setConfigEntry('task_crawler_node_communicator_interval_delay', 250);
 // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-MAX-RUNS
 $cfg->setConfigEntry('task_crawler_node_communicator_max_runs', 0);
 
-// CFG: CRAWLER-LOCAL-URL-GETTER-TASK-CLASS
-$cfg->setConfigEntry('crawler_local_url_getter_task_class', 'CrawlerLocalUrlGetterTask');
+// CFG: CRAWLER-LOCAL-URL-CRAWLER-TASK-CLASS
+$cfg->setConfigEntry('crawler_local_url_crawler_task_class', 'CrawlerLocalUrlGetterTask');
 
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-STARTUP-DELAY
-$cfg->setConfigEntry('task_crawler_local_url_getter_startup_delay', 1500);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-STARTUP-DELAY
+$cfg->setConfigEntry('task_crawler_local_url_crawler_startup_delay', 1500);
 
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-INTERVAL-DELAY
-$cfg->setConfigEntry('task_crawler_local_url_getter_interval_delay', 200);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-INTERVAL-DELAY
+$cfg->setConfigEntry('task_crawler_local_url_crawler_interval_delay', 200);
 
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-MAX-RUNS
-$cfg->setConfigEntry('task_crawler_local_url_getter_max_runs', 0);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-MAX-RUNS
+$cfg->setConfigEntry('task_crawler_local_url_crawler_max_runs', 0);
 
-// CFG: CRAWLER-REMOTE-URL-GETTER-TASK-CLASS
-$cfg->setConfigEntry('crawler_remote_url_getter_task_class', 'CrawlerRemoteUrlGetterTask');
+// CFG: CRAWLER-REMOTE-URL-CRAWLER-TASK-CLASS
+$cfg->setConfigEntry('crawler_remote_url_crawler_task_class', 'CrawlerRemoteUrlGetterTask');
 
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-STARTUP-DELAY
-$cfg->setConfigEntry('task_crawler_remote_url_getter_startup_delay', 1500);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-STARTUP-DELAY
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_startup_delay', 1500);
 
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-INTERVAL-DELAY
-$cfg->setConfigEntry('task_crawler_remote_url_getter_interval_delay', 200);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-INTERVAL-DELAY
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_interval_delay', 200);
 
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-MAX-RUNS
-$cfg->setConfigEntry('task_crawler_remote_url_getter_max_runs', 0);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-MAX-RUNS
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_max_runs', 0);
 
 // CFG: CRAWLER-REMOTE-JOB-PUBLISHER-TASK-CLASS
 $cfg->setConfigEntry('crawler_remote_job_publisher_task_class', 'CrawlerRemoteJobPublisherTask');
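
Since both the task name and the config key are derived from the stack name,
another URL source could later be added purely through configuration. A
hypothetical example in config.php style (the 'sitemap' stack and its task
class are made-up names for illustration):

// Hypothetical: append a new stack name to the list ...
$cfg->setConfigEntry('crawler_url_stacks', 'local_start:uploaded_list:rss_start:found_rss:sitemap');

// ... and provide the matching task class entry the initializer loop looks up.
// CFG: CRAWLER-URL-SOURCE-SITEMAP-TASK-CLASS
$cfg->setConfigEntry('crawler_url_source_sitemap_task_class', 'CrawlerSitemapUrlSourceTask');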
application/hub/main/filter/task/crawler/class_CrawlerTaskHandlerInitializerFilter.php
index 46a84ffef9ef7c10ae24524dbb7ee479332fdb98..36fe3946c9a95b5cf68ac8e97105d6246704cfc1 100644 (file)
@@ -69,13 +69,13 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F
                $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_node_communicator_task_class');
                $handlerInstance->registerTask('crawler_node_communicator', $taskInstance);
 
-               // 2) Local URL getter (gets URLs locally and adds them to the analyzer's input stack)
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_getter_task_class');
-               $handlerInstance->registerTask('crawler_local_url_getter', $taskInstance);
+               // 2) Local URL crawler (gets URLs locally and adds them to the analyzer's input stack)
+               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_crawler_task_class');
+               $handlerInstance->registerTask('crawler_local_url_crawler', $taskInstance);
 
-               // 3) Remote URL getter (gets URLs locally for other nodes, also includes the crawled URL in local index)
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_getter_task_class');
-               $handlerInstance->registerTask('crawler_remote_url_getter', $taskInstance);
+               // 3) Remote URL crawler (gets URLs locally for other nodes, also includes the crawled URL in local index)
+               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_crawler_task_class');
+               $handlerInstance->registerTask('crawler_remote_url_crawler', $taskInstance);
 
                // 4) Remote-crawl publisher (publishes crawl jobs for remote retrieval)
                $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_job_publisher_task_class');
@@ -101,23 +101,16 @@ class CrawlerTaskHandlerInitializerFilter extends BaseCrawlerFilter implements F
                $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_ping_task_class');
                $handlerInstance->registerTask('crawler_ping', $taskInstance);
 
-               // 10) URL source: local start
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_local_start_task_class');
-               $handlerInstance->registerTask('crawler_url_source_local_start', $taskInstance);
+               // 10) URL sources
+               foreach (explode(':', $this->getConfigInstance()->getConfigEntry('crawler_url_stacks')) as $stack) {
+                       // Init task instance
+                       $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_' . $stack . '_task_class');
 
-               // 11) URL source: uploaded list
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_uploaded_list_task_class');
-               $handlerInstance->registerTask('crawler_url_source_uploaded_list', $taskInstance);
+                       // And register it
+                       $handlerInstance->registerTask('crawler_url_source_' . $stack, $taskInstance);
+               } // END - foreach
 
-               // 12) URL source: RSS feed
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_rss_start_task_class');
-               $handlerInstance->registerTask('crawler_url_source_rss_start', $taskInstance);
-
-               // 13) URL source: found RSS/ATOM feed
-               $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_found_rss_task_class');
-               $handlerInstance->registerTask('crawler_url_source_found_rss', $taskInstance);
-
-               // 14) Uploaded list scanner (checks for wanted files)
+               // 11) Uploaded list scanner (checks for wanted files)
                $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_scanner_task_class');
                $handlerInstance->registerTask('crawler_uploaded_list_scanner', $taskInstance);
 
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
index 2084887bb1024ea2318cdc23e02777e229a4cd45..02ff77f32eb81c9c73f8f766db080acb776dc264 100644 (file)
@@ -47,6 +47,15 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
         */
        const STACK_NAME_CSV_ENTRY = 'csv_entry';
 
+       /**
+        * Size of crawl (CSV) entry which is an indexed array:
+        *
+        * 0 = URL to crawl
+        * 1 = Crawl depth of URL
+        * 2 = Crawl depth of linked URLs (same or other host only)
+        */
+       const CRAWL_ENTRY_SIZE = 3;
+
        /**
         * "Imported" CSV files
         */
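
For orientation, a small sketch of how one line of an uploaded CSV list could
map onto such an indexed crawl entry; the URL and the ';' separator are
assumptions for illustration only (the actual parsing happens in
parseCsvFile()):

<?php
// Hypothetical line from an uploaded CSV list:
$line = 'http://www.example.com/;5;2';

// 0 = URL to crawl, 1 = crawl depth of URL, 2 = crawl depth of linked URLs
$csvData = explode(';', trim($line));

// Must hold CRAWL_ENTRY_SIZE (= 3) elements, as asserted in the hunks below.
assert(count($csvData) == 3);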
@@ -140,6 +149,22 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                return $sourceInstance;
        }
 
+       /**
+        * Enriches the given CSV entry (array) and saves it in the assigned
+        * file-based stack. A lot more information is added to such an entry,
+        * such as which files shall be crawled.
+        *
+        * @param       $csvData        Array with data from a CSV file
+        * @return      void
+        */
+       private function saveCsvDataInCrawlerQueue (array $csvData) {
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!');
+
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+       }
+
        /**
         * Checks whether a CSV file has been loaded (added to the stack)
         *
@@ -229,7 +254,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                } // END - if
 
                // ...  with 3 elements, later enhancements may accept more
-               assert(count($csvData) == 3);
+               assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
 
                /*
                 * Push the file back on stack as it may contain more entries. This way
@@ -253,6 +278,18 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                // Debug message
                /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
 
+               // Pop it from stack
+               $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY);
+
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+               // It must have 3 elements (see method parseCsvFile() for details)
+               assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
+
+               // Save it in the crawler queue (which will enrich it with much more information)
+               $this->saveCsvDataInCrawlerQueue($csvData);
+
                // Debug message
                /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
        }