later be used by the crawlers to "look" for pending crawl jobs.
Signed-off-by: Roland Haeder <roland@mxchange.org>
// CFG: CRAWLER-URL-SOURCE-FOUND-RSS-CLASS
$cfg->setConfigEntry('crawler_url_source_found_rss_class', 'CrawlerFoundRssUrlSource');
+// CFG: CRAWLER-URL-STACKS
+$cfg->setConfigEntry('crawler_url_stacks', 'local_start:uploaded_list:rss_start:found_rss');
+
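For illustration, a minimal sketch of how this colon-separated entry can be split into the individual stack names, assuming the configuration instance also exposes the getConfigEntry() accessor used further below:

	// Split the configured list into its stack names
	$stacks = explode(':', $cfg->getConfigEntry('crawler_url_stacks'));

	// $stacks is now array('local_start', 'uploaded_list', 'rss_start', 'found_rss')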
// CFG: CRAWLER-NODE-COMMUNICATOR-TASK-CLASS
$cfg->setConfigEntry('crawler_node_communicator_task_class', 'CrawlerNodeCommunicatorTask');
// CFG: TASK-CRAWLER-NODE-COMMUNICATOR-MAX-RUNS
$cfg->setConfigEntry('task_crawler_node_communicator_max_runs', 0);
-// CFG: CRAWLER-LOCAL-URL-GETTER-TASK-CLASS
-$cfg->setConfigEntry('crawler_local_url_getter_task_class', 'CrawlerLocalUrlGetterTask');
+// CFG: CRAWLER-LOCAL-URL-CRAWLER-TASK-CLASS
+$cfg->setConfigEntry('crawler_local_url_crawler_task_class', 'CrawlerLocalUrlGetterTask');
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-STARTUP-DELAY
-$cfg->setConfigEntry('task_crawler_local_url_getter_startup_delay', 1500);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-STARTUP-DELAY
+$cfg->setConfigEntry('task_crawler_local_url_crawler_startup_delay', 1500);
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-INTERVAL-DELAY
-$cfg->setConfigEntry('task_crawler_local_url_getter_interval_delay', 200);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-INTERVAL-DELAY
+$cfg->setConfigEntry('task_crawler_local_url_crawler_interval_delay', 200);
-// CFG: TASK-CRAWLER-LOCAL-URL-GETTER-MAX-RUNS
-$cfg->setConfigEntry('task_crawler_local_url_getter_max_runs', 0);
+// CFG: TASK-CRAWLER-LOCAL-URL-CRAWLER-MAX-RUNS
+$cfg->setConfigEntry('task_crawler_local_url_crawler_max_runs', 0);
-// CFG: CRAWLER-REMOTE-URL-GETTER-TASK-CLASS
-$cfg->setConfigEntry('crawler_remote_url_getter_task_class', 'CrawlerRemoteUrlGetterTask');
+// CFG: CRAWLER-REMOTE-URL-CRAWLER-TASK-CLASS
+$cfg->setConfigEntry('crawler_remote_url_crawler_task_class', 'CrawlerRemoteUrlGetterTask');
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-STARTUP-DELAY
-$cfg->setConfigEntry('task_crawler_remote_url_getter_startup_delay', 1500);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-STARTUP-DELAY
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_startup_delay', 1500);
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-INTERVAL-DELAY
-$cfg->setConfigEntry('task_crawler_remote_url_getter_interval_delay', 200);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-INTERVAL-DELAY
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_interval_delay', 200);
-// CFG: TASK-CRAWLER-REMOTE-URL-GETTER-MAX-RUNS
-$cfg->setConfigEntry('task_crawler_remote_url_getter_max_runs', 0);
+// CFG: TASK-CRAWLER-REMOTE-URL-CRAWLER-MAX-RUNS
+$cfg->setConfigEntry('task_crawler_remote_url_crawler_max_runs', 0);
// CFG: CRAWLER-REMOTE-JOB-PUBLISHER-TASK-CLASS
$cfg->setConfigEntry('crawler_remote_job_publisher_task_class', 'CrawlerRemoteJobPublisherTask');
$taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_node_communicator_task_class');
$handlerInstance->registerTask('crawler_node_communicator', $taskInstance);
- // 2) Local URL getter (gets URLs locally and adds them to the analyzer's input stack)
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_getter_task_class');
- $handlerInstance->registerTask('crawler_local_url_getter', $taskInstance);
+ // 2) Local URL crawler (gets URLs locally and adds them to the analyzer's input stack)
+ $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_local_url_crawler_task_class');
+ $handlerInstance->registerTask('crawler_local_url_crawler', $taskInstance);
- // 3) Remote URL getter (gets URLs locally for other nodes, also includes the crawled URL in local index)
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_getter_task_class');
- $handlerInstance->registerTask('crawler_remote_url_getter', $taskInstance);
+	// 3) Remote URL crawler (gets URLs locally for other nodes and also includes the crawled URLs in the local index)
+ $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_url_crawler_task_class');
+ $handlerInstance->registerTask('crawler_remote_url_crawler', $taskInstance);
// 4) Remote-crawl publisher (publishes crawl jobs for remote retrieval)
$taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_remote_job_publisher_task_class');
$taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_ping_task_class');
$handlerInstance->registerTask('crawler_ping', $taskInstance);
- // 10) URL source: local start
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_local_start_task_class');
- $handlerInstance->registerTask('crawler_url_source_local_start', $taskInstance);
+ // 10) URL sources
+	foreach (explode(':', $this->getConfigInstance()->getConfigEntry('crawler_url_stacks')) as $stack) {
+ // Init task instance
+ $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_' . $stack . '_task_class');
- // 11) URL source: uploaded list
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_uploaded_list_task_class');
- $handlerInstance->registerTask('crawler_url_source_uploaded_list', $taskInstance);
+ // And register it
+ $handlerInstance->registerTask('crawler_url_source_' . $stack, $taskInstance);
+ } // END - foreach
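For illustration, one iteration of the loop above with the stack name fixed to 'found_rss'; these are the same calls the loop performs, resolving the same 'crawler_url_source_found_rss_task_class' entry the removed per-source registration below used:

	// Init task instance for the 'found_rss' URL source ...
	$taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_found_rss_task_class');

	// ... and register it under the derived task name
	$handlerInstance->registerTask('crawler_url_source_found_rss', $taskInstance);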
- // 12) URL source: RSS feed
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_rss_start_task_class');
- $handlerInstance->registerTask('crawler_url_source_rss_start', $taskInstance);
-
- // 13) URL source: found RSS/ATOM feed
- $taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_url_source_found_rss_task_class');
- $handlerInstance->registerTask('crawler_url_source_found_rss', $taskInstance);
-
- // 14) Uploaded list scanner (checks for wanted files)
+ // 11) Uploaded list scanner (checks for wanted files)
$taskInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_scanner_task_class');
$handlerInstance->registerTask('crawler_uploaded_list_scanner', $taskInstance);
*/
const STACK_NAME_CSV_ENTRY = 'csv_entry';
+ /**
+	 * Size of a crawl (CSV) entry, which is an indexed array:
+ *
+ * 0 = URL to crawl
+ * 1 = Crawl depth of URL
+	 * 2 = Crawl depth of linked URLs (same or other host only)
+ */
+ const CRAWL_ENTRY_SIZE = 3;
+
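For illustration, an indexed array matching this layout (the URL and depth values here are made up):

	// Example crawl entry, illustrative values only
	$csvData = array(
		0 => 'http://www.example.com/',	// URL to crawl
		1 => 2,				// crawl depth of this URL
		2 => 1,				// crawl depth of linked URLs
	);

	// Such an entry always satisfies this size check
	assert(count($csvData) == self::CRAWL_ENTRY_SIZE);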
/**
* "Imported" CSV files
*/
return $sourceInstance;
}
+ /**
+	 * Enriches the given CSV entry (array) and saves it in the assigned
+	 * file-based stack. A lot more information is added to such an entry,
+	 * for example which files shall be crawled.
+ *
+ * @param $csvData Array with data from a CSV file
+ * @return void
+ */
+ private function saveCsvDataInCrawlerQueue (array $csvData) {
+ // Debug message
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!');
+
+ // Debug message
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ }
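The method body above is still a stub that only emits debug output. A hypothetical sketch of what the enrichment and save step might look like; the associative keys and the 'crawler_queue' stack name are invented here for illustration, and a pushNamed() counterpart to the popNamed() call used further below is assumed:

	// Hypothetical: map the indexed CSV entry to named fields
	$crawlEntry = array(
		'url'          => $csvData[0],
		'depth'        => $csvData[1],
		'linked_depth' => $csvData[2],
	);

	// Hypothetical: push the enriched entry onto an assumed 'crawler_queue' stack
	$this->getStackSourceInstance()->pushNamed('crawler_queue', $crawlEntry);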
+
/**
* Checks whether a CSV file has been loaded (added to the stack)
*
} // END - if
// ... with 3 elements, later enhancements may accept more
- assert(count($csvData) == 3);
+ assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
/*
* Push the file back on stack as it may contain more entries. This way
// Debug message
/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+ // Pop it from stack
+ $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY);
+
+ // Debug message
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
+
+ // It must have 3 elements (see method parseCsvFile() for details)
+ assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
+
+	// Save it in the crawler queue (which will enrich it with a lot more information)
+ $this->saveCsvDataInCrawlerQueue($csvData);
+
// Debug message
/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}