// CFG: TASK-CRAWLER-UPLOADED-LIST-SCANNER-MAX-RUNS
$cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
+// CFG: CRAWLER-CSV-FILE-PATH
+$cfg->setConfigEntry('crawler_csv_file_path', 'data/url_lists');
+
///////////////////////////////////////////////////////////////////////////////
// HTTP Configuration
///////////////////////////////////////////////////////////////////////////////
* @return $unlInstance An instance of a LocateableNode class for this node
*/
public function determineUniversalNodeLocator () {
// Trace entry into this method (noisy debug output)
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
// Determine UNL based on this node:
// 1) Get discovery class (created from the configured 'unl_discovery_class' entry)
$discoveryInstance = ObjectFactory::createObjectByConfiguredName('unl_discovery_class');
// 2) Let the discovery instance determine the UNL, handing over this node
$unlInstance = $discoveryInstance->discoverUniversalNodeLocatorByNode($this);
// 3) Return it
// The debug line below calls __toString() on the result, so $unlInstance
// must be an object at this point.
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: unlInstance= ' . $unlInstance->__toString() . ' - EXIT!');
return $unlInstance;
}
* @return $unlArray An array from an instance of a LocateableNode class for this node
*/
public final function getUniversalNodeLocatorArray () {
// Trace entry into this method (noisy debug output)
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
// Get the Universal Node Locator (UNL) instance
$unlInstance = $this->determineUniversalNodeLocator();
// NOTE(review): die() terminates the script right here after printing the
// type and a print_r() dump of $unlInstance, so nothing below is reached as
// the code stands - presumably a development stub; confirm before release.
die(__METHOD__ . ':unlInstance[' . gettype($unlInstance) . ']=' . print_r($unlInstance, TRUE));
// Return it
// NOTE(review): $unlArray is not assigned anywhere in the lines visible
// here - verify against the full method how it is derived from $unlInstance.
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('NODE[' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
return $unlArray;
}
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
+ /**
+ * Cached path of CSV files
+ */
+ private $csvFilesPath = '';
+
+ /**
+ * Last found CSV file
+ */
+ private $lastCsvFile = '';
+
/**
* Protected constructor
*
protected function __construct () {
// Call parent constructor, handing over this class' name
parent::__construct(__CLASS__);
+
+ // Set CSV files path: config entry 'base_path', a slash, then config entry 'crawler_csv_file_path'
+ $this->csvFilesPath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
+
+ // Get directory instance (class taken from config entry 'directory_class') for that path
+ $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilesPath));
+
+ // Set it here; isCsvFileFound() fetches it again through getDirectoryInstance()
+ $this->setDirectoryInstance($directoryInstance);
+ }
+
+ /**
+ * Checks whether a CSV file is found in configured path
+ *
+ * @return $isFound Whether a CSV file is found
+ */
+ private function isCsvFileFound () {
// Only one directory entry is read and examined per call; FALSE is returned
// when that single entry is empty or is not a '.csv' file.
+ // Is the directory iterator still at a valid position?
+ if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
+ // No, so rewind to start before reading again
+ $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
+ } // END - if
+
+ // Read next entry; '.htaccess', '.' and '..' are handed over as the names to except
+ $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array('.htaccess', '.', '..'));
+
+ // Debug message: length and name of the entry just read
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
+
+ // Is it empty or are its last four characters not '.csv' (case-sensitive comparison)?
+ if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
+ // Skip further processing, no CSV file instance is set in this case
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
+ return FALSE;
+ } // END - if
+
+ // Initialize CSV instance (class from config entry 'csv_file_class') for <csvFilesPath>/<directoryEntry>
+ $csvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilesPath . '/' . $directoryEntry));
+
+ // Set it here through setCsvFileInstance()
+ $this->setCsvFileInstance($csvFileInstance);
+
+ // Found a file
+ return TRUE;
}
/**
* to process more than one entry at a time.
*/
$this->processNextEntry();
- } // @TODO elseif ($this->
+ } elseif ($this->isCsvFileFound()) {
+ /*
+ * A file containing a URL list is found. Please note the format is
+ * CSV-like as you may wish to provide metadata such as crawl
+ * depth, handling of 3rd-party URLs and such.
+ */
+ $this->importCsvFile();
+ }
$this->partialStub('Please implement this method.');
}
* @return $dataSetInstance An instance of a StoreableCriteria class
*/
private function prepareLocalDataSetInstance () {
// Trace entry into this method (noisy debug output)
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('DHT-WRAPPER[' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
// Get node/request instances
$nodeInstance = NodeObjectFactory::createNodeInstance();
// NOTE(review): $requestInstance is not used in the lines visible here -
// verify against the full method.
$requestInstance = ApplicationHelper::getSelfInstance()->getRequestInstance();
// NOTE(review): $dataSetInstance is not created anywhere in the lines
// visible here - verify against the full method.
// Add the node's isAcceptingDhtBootstrap() result, passed through
// translateBooleanToYesNo(), as criteria for column DB_COLUMN_ACCEPT_BOOTSTRAP
$dataSetInstance->addCriteria(self::DB_COLUMN_ACCEPT_BOOTSTRAP, $this->translateBooleanToYesNo($nodeInstance->isAcceptingDhtBootstrap()));
// Return it
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('DHT-WRAPPER[' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
return $dataSetInstance;
}