*
* @author Roland Haeder <webmaster@shipsimu.org>
* @version 0.0
- * @copyright Copyright (c) 2007 - 2008 Roland Haeder, 2009 - 2012 Hub Developer Team
+ * @copyright Copyright (c) 2007 - 2008 Roland Haeder, 2009 - 2014 Hub Developer Team
* @license GNU GPL 3.0 or any newer version
*
* This program is free software: you can redistribute it and/or modify
// CFG: DHT-RECIPIENT-DISCOVERY-CLASS
$cfg->setConfigEntry('dht_recipient_discovery_class', 'DhtRecipientDiscovery');
+// CFG: UNIVERSAL-NODE-LOCATOR-DISCOVERY-CLASS
+$cfg->setConfigEntry('unl_discovery_class', 'UniversalNodeLocatorDiscovery');
+
// CFG: RECIPIENT-LIST-CLASS
$cfg->setConfigEntry('recipient_list_class', 'RecipientList');
// CFG: UDP-CONNECTION-HELPER-CLASS
$cfg->setConfigEntry('udp_connection_helper_class', 'UdpConnectionHelper');
+// CFG: HUB-COMMUNICATION-PROTOCOL-TYPE
+$cfg->setConfigEntry('hub_communication_protocol_type', 'tcp');
+
+// CFG: TCP-PROTOCOL-RESOLVER-CLASS
+$cfg->setConfigEntry('tcp_protocol_resolver_class', 'TcpProtocolResolver');
+
// CFG: TCP-BUFFER-LENGTH
$cfg->setConfigEntry('tcp_buffer_length', 1024);
// CFG: CRAWLER-URL-FOUND-RSS-FILE-STACK-INDEX-CLASS
$cfg->setConfigEntry('crawler_url_found_rss_file_stack_index_class', 'FileStackIndex');
+ // CFG: CRAWLER-UPLOADED-LIST-URL-SOURCE-STACK-CLASS
+ $cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacker');
+
+ // CFG: STACKER-CSV-FILE-MAX-SIZE
+ $cfg->setConfigEntry('stacker_csv_file_max_size', 10);
+
// CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
$cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
// CFG: TASK-CRAWLER-UPLOADED-LIST-SCANNER-MAX-RUNS
$cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
-// CFG: CSV-FILE-PATH
-$cfg->setConfigEntry('csv_file_path', 'data/url_lists');
+// CFG: CRAWLER-CSV-FILE-PATH
+$cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
///////////////////////////////////////////////////////////////////////////////
// HTTP Configuration
*/
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
/**
- * Cached path of CSV files
+ * "Cached" CSV path
*/
- private $csvFilesPath = '';
+ private $csvFilePath = '';
/**
- * Last found CSV file
+ * Last CSV file instance
*/
- private $lastCsvFile = '';
+ private $lastCsvFileInstance = NULL;
+
+ /**
+ * Stack for pushing data from this class to another
+ */
+ private $stackSourceInstance = NULL;
+
+ /**
+ * Stack name for a CSV file
+ */
+ const STACK_NAME_CSV_FILE = 'csv_file';
+
+ /**
+ * "Imported" CSV files
+ */
+ private $csvFileImported = array();
/**
* Protected constructor
// Call parent constructor
parent::__construct(__CLASS__);
- // Set CSV files path
- $this->csvFilesPath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
+ // "Cache" CSV path for faster usage: base path plus the relative directory registered under 'crawler_csv_file_path'
+ $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
- // Get directory instance
- $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilesPath));
+ // Initialize directory instance
+ $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
// Set it here
$this->setDirectoryInstance($directoryInstance);
+
+ // Init stack instance
+ $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
+
+ // Init stack
+ $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
}
/**
- * Creates an instance of this class
- *
- * @return $sourceInstance An instance of a Source class
- */
- public final static function createCrawlerUploadedListUrlSource () {
- // Get new instance
- $sourceInstance = new CrawlerUploadedListUrlSource();
-
- // Init source
- $sourceInstance->initSource('crawler', 'uploaded_list');
-
- // Return the prepared instance
- return $sourceInstance;
- }
-
- /**
- * Checks whether a CSV file is found
+ * Checks whether a CSV file is found in configured path
*
* @return $isFound Whether a CSV file is found
*/
private function isCsvFileFound () {
- // Is the instance valid?
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Is the directory iterator still at a valid position?
if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
- // Then rewind it
+ // No, so rewind it to the start before reading again
$this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
} // END - if
// Read next entry
- $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array('.htaccess', '.', '..'));
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
+ $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported)); // names recorded in csvFileImported are passed as additional exclusions
- // The read entry has not to be empty and extension must be '.csv'
+ // Debug message: the entry just read and its length
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
+
+ // Nothing was read, or the entry's last four characters are not '.csv'?
if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
// Skip further processing
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
return FALSE;
} // END - if
- // Initialize CSV instance
- $csvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
+ // Create the CSV file instance for the found entry and keep it in lastCsvFileInstance
+ $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
- // Set it here
- $this->setCsvFileInstance($csvFileInstance);
-
- // Found a file
+ // A CSV file has been found
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
return TRUE;
}
- // Get a ??? @TODO
-
+ /**
+ * Factory method: builds an instance of this class and initializes it
+ * through initSource('crawler', 'uploaded_list').
+ *
+ * @return $instance The prepared instance of this Source class
+ */
+ public final static function createCrawlerUploadedListUrlSource () {
+ // Construct the source object ...
+ $instance = new self();
+
+ // ... initialize it as the crawler's 'uploaded_list' source ...
+ $instance->initSource('crawler', 'uploaded_list');
+
+ // ... and hand it over to the caller
+ return $instance;
+ }
+
+ /**
+ * Starts the import of the CSV file found last: the file instance is pushed
+ * onto the CSV file stack (to be processed by another task) and its base
+ * name is remembered as "imported".
+ *
+ * @return void
+ * @throws NullPointerException If lastCsvFileInstance is not set
+ */
+ private function importCsvFile () {
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Take the pending CSV file instance
+ $csvFileInstance = $this->lastCsvFileInstance;
+
+ // Without an instance there is nothing to import
+ if ($csvFileInstance === NULL) {
+ // This should not happen
+ throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
+ } // END - if
+
+ // Push the file onto the named stack ...
+ $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);
+
+ // ... record its base name, which isCsvFileFound() passes on as an exclusion ...
+ $this->csvFileImported[] = basename($csvFileInstance->getFileName());
+
+ // ... and finally drop the reference held by this object (to save some RAM)
+ $this->lastCsvFileInstance = NULL;
+
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ }
+
+ /**
+ * Getter for the stackSourceInstance field, the stack this source pushes
+ * CSV file instances to. The field is assigned in the constructor from the
+ * 'crawler_uploaded_list_url_source_stack_class' configuration entry.
+ *
+ * @return $stackSourceInstance The stack instance held by this source (NULL until it has been assigned)
+ */
+ public final function getStackSourceInstance () {
+ return $this->stackSourceInstance;
+ }
+
/**
* Processes entries in the stack.
*
public function processStack () {
// Does the stack have some entries left?
if (!$this->isUrlStackEmpty()) {
- // Handle next entry
+ /*
+ * Handle next entry. This method will be called very often, so need
+ * to process more than one entry at a time.
+ */
$this->processNextEntry();
} elseif ($this->isCsvFileFound()) {
- // A CSV file has been found and can maybe be imported.
+ /*
+ * A file containing an URL list is found. Please note the format is
+ * CSV-like as you may wish to provide meta data such as crawl
+ * depth, handling of 3rd-party URLs and such.
+ */
$this->importCsvFile();
}