From 5ef98382f47aa1b63151acebb4c2788888a9cb81 Mon Sep 17 00:00:00 2001 From: Roland Haeder Date: Sun, 7 Dec 2014 21:30:48 +0100 Subject: [PATCH] Continued: - added initial stuff to import CSV files (unfinished) - updated to latest core Signed-off-by: Roland Haeder --- application/hub/config.php | 3 + .../hub/main/nodes/class_BaseHubNode.php | 5 ++ .../class_CrawlerUploadedListUrlSource.php | 63 ++++++++++++++++++- ...odeDistributedHashTableDatabaseWrapper.php | 3 + core | 2 +- 5 files changed, 74 insertions(+), 2 deletions(-) diff --git a/application/hub/config.php b/application/hub/config.php index a1ec1a2ec..217135e03 100644 --- a/application/hub/config.php +++ b/application/hub/config.php @@ -1463,6 +1463,9 @@ $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_interval_delay', 1000); // CFG: TASK-CRAWLER-URL-SOURCE-FOUND-RSS-MAX-RUNS $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0); +// CFG: CRAWLER-CSV-FILE-PATH +$cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists'); + /////////////////////////////////////////////////////////////////////////////// // HTTP Configuration /////////////////////////////////////////////////////////////////////////////// diff --git a/application/hub/main/nodes/class_BaseHubNode.php b/application/hub/main/nodes/class_BaseHubNode.php index e71a63026..ce3637cc1 100644 --- a/application/hub/main/nodes/class_BaseHubNode.php +++ b/application/hub/main/nodes/class_BaseHubNode.php @@ -689,6 +689,7 @@ class BaseHubNode extends BaseHubSystem implements Updateable, AddableCriteria { * @return $unlInstance An instance of a LocateableNode class for this node */ public function determineUniversalNodeLocator () { + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . 
']: CALLED!'); // Determine UNL based on this node: // 1) Get discovery class $discoveryInstance = ObjectFactory::createObjectByConfiguredName('unl_discovery_class'); @@ -697,6 +698,7 @@ class BaseHubNode extends BaseHubSystem implements Updateable, AddableCriteria { $unlInstance = $discoveryInstance->discoverUniversalNodeLocatorByNode($this); // 3) Return it + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: unlInstance= ' . $unlInstance->__toString() . ' - EXIT!'); return $unlInstance; } @@ -706,6 +708,8 @@ class BaseHubNode extends BaseHubSystem implements Updateable, AddableCriteria { * @return $unlArray An array from an instance of a LocateableNode class for this node */ public final function getUniversalNodeLocatorArray () { + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + // Get the Universal Node Locator (UNL) instance $unlInstance = $this->determineUniversalNodeLocator(); @@ -713,6 +717,7 @@ class BaseHubNode extends BaseHubSystem implements Updateable, AddableCriteria { die(__METHOD__ . ':unlInstance[' . gettype($unlInstance) . ']=' . print_r($unlInstance, TRUE)); // Return it + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); return $unlArray; } diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index 928a9e284..c98aa6ff0 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -22,6 +22,16 @@ * along with this program. If not, see . 
*/ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable { + /** + * Cached path of CSV files + */ + private $csvFilesPath = ''; + + /** + * Last found CSV file + */ + private $lastCsvFile = ''; + /** * Protected constructor * @@ -30,6 +40,50 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R protected function __construct () { // Call parent constructor parent::__construct(__CLASS__); + + // Set CSV files path + $this->csvFilesPath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path'); + + // Get directory instance + $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilesPath)); + + // Set it here + $this->setDirectoryInstance($directoryInstance); + } + + /** + * Checks whether a CSV file is found in configured path + * + * @return $isFound Whether a CSV file is found + */ + private function isCsvFileFound () { + // Is it valid? + if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) { + // Rewind to start + $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind(); + } // END - if + + // Read next entry + $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array('.htaccess', '.', '..')); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry); + + // Is it empty or wrong file extension? + if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) { + // Skip further processing + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . 
' - SKIPPED!'); + return FALSE; + } // END - if + + // Initialize CSV instance + $csvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilesPath . '/' . $directoryEntry)); + + // Set it here + $this->setCsvFileInstance($csvFileInstance); + + // Found a file + return TRUE; } /** @@ -64,7 +118,14 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * to process more than one entry at a time. */ $this->processNextEntry(); - } // @TODO elseif ($this-> + } elseif ($this->isCsvFileFound()) { + /* + * A file containing an URL list is found. Please note the format is + * CSV-like as you may wish to provide meta data such as crawl + * depth, handling of 3rd-party URLs and such. + */ + $this->importCsvFile(); + } $this->partialStub('Please implement this method.'); } diff --git a/application/hub/main/wrapper/node/class_NodeDistributedHashTableDatabaseWrapper.php b/application/hub/main/wrapper/node/class_NodeDistributedHashTableDatabaseWrapper.php index 9dafd9f35..928eb3310 100644 --- a/application/hub/main/wrapper/node/class_NodeDistributedHashTableDatabaseWrapper.php +++ b/application/hub/main/wrapper/node/class_NodeDistributedHashTableDatabaseWrapper.php @@ -136,6 +136,8 @@ class NodeDistributedHashTableDatabaseWrapper extends BaseDatabaseWrapper implem * @return $dataSetInstance An instance of a StoreableCriteria class */ private function prepareLocalDataSetInstance () { + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('DHT-WRAPPER[' . __METHOD__ . ':' . __LINE__ . 
']: CALLED!'); + // Get node/request instances $nodeInstance = NodeObjectFactory::createNodeInstance(); $requestInstance = ApplicationHelper::getSelfInstance()->getRequestInstance(); @@ -169,6 +171,7 @@ class NodeDistributedHashTableDatabaseWrapper extends BaseDatabaseWrapper implem $dataSetInstance->addCriteria(self::DB_COLUMN_ACCEPT_BOOTSTRAP, $this->translateBooleanToYesNo($nodeInstance->isAcceptingDhtBootstrap())); // Return it + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('DHT-WRAPPER[' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); return $dataSetInstance; } diff --git a/core b/core index 6339d66e4..c8ea0af3f 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 6339d66e421f4514ec9de8f61d96e38cb34005e6 +Subproject commit c8ea0af3f3bfe092c38f0864d689a82172af19c0 -- 2.39.5