* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
+ /**
+ * "Cached" path of the directory that holds the CSV files. The constructor
+ * builds it from the 'base_path' and 'crawler_csv_file_path'
+ * configuration entries.
+ */
+ private $csvFilePath = '';
+
+ /**
+ * Last CSV file instance. It is created by isCsvFileFound() and reset to
+ * NULL by importCsvFile() after the instance has been pushed on the stack.
+ */
+ private $lastCsvFileInstance = NULL;
+
+ /**
+ * Stack for pushing data from this class to another
+ */
+ private $stackSourceInstance = NULL;
+
+ /**
+ * Stack name for a CSV file
+ */
+ const STACK_NAME_CSV_FILE = 'csv_file';
+
+ /**
+ * Base names of "imported" CSV files. These names are excluded when the
+ * next directory entry is read.
+ */
+ private $csvFileImported = array();
+
/**
* Protected constructor
*
protected function __construct () {
// Call parent constructor with the name of this class
parent::__construct(__CLASS__);
+
+ // "Cache" CSV path (configured base path + '/' + configured CSV path) for faster usage
+ $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
+
+ // Initialize directory instance for the "cached" CSV path
+ $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
+
+ // Set it here, isCsvFileFound() reads directory entries through it
+ $this->setDirectoryInstance($directoryInstance);
+
+ // Init stack instance, the class is looked up by its configured name
+ $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
+
+ // Init the stack onto which importCsvFile() pushes CSV file instances
+ $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
+ }
+
+ /**
+ * Checks whether a CSV file is found in configured path. One directory
+ * entry is read per call; '.htaccess', '.', '..' and all already imported
+ * files are excluded. When an entry ending on '.csv' is found, a CSV file
+ * instance for it is stored in $this->lastCsvFileInstance.
+ *
+ * @return $isFound Whether a CSV file is found
+ */
+ private function isCsvFileFound () {
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Is the directory iterator still valid?
+ if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
+ // No, so rewind to start
+ $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
+ } // END - if
+
+ // Read next entry, except special entries and already imported files
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
+ $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
+
+ // Debug message
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
+
+ // Is it empty or not ending on '.csv' (the comparison is case-sensitive)?
+ if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
+ // Skip further processing
+ /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
+ return FALSE;
+ } // END - if
+
+ // Initialize CSV file instance from "cached" path and entry name
+ $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
+
+ // Found an entry
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ return TRUE;
}
/**
// Init source
$sourceInstance->initSource('crawler', 'uploaded_list');
- // Get a ??? @TODO
-
// Return the prepared instance
return $sourceInstance;
}
+ /**
+ * Initializes the import of the CSV file which is being processed by
+ * another task: the last found CSV file instance is pushed on the stack
+ * self::STACK_NAME_CSV_FILE, its base name is remembered as "imported"
+ * and the instance reference is reset to NULL.
+ *
+ * @return void
+ * @throws NullPointerException If lastCsvFileInstance is not set
+ */
+ private function importCsvFile () {
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+ // Is the instance set?
+ if (is_null($this->lastCsvFileInstance)) {
+ // This should not happen, no CSV file instance is available
+ throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
+ } // END - if
+
+ // Stack this file
+ $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);
+
+ // ... and mark it as "imported" (base name only, isCsvFileFound() excludes these names)
+ array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));
+
+ // ... and finally NULL it (to save some RAM)
+ $this->lastCsvFileInstance = NULL;
+
+ //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+ }
+
+ /**
+ * Getter for stackSourceInstance variable, the stack instance which is
+ * created in the constructor.
+ *
+ * @return $stackSourceInstance An instance of an additional stack
+ */
+ public final function getStackSourceInstance () {
+ return $this->stackSourceInstance;
+ }
+
/**
* Processes entries in the stack.
*
* to process more than one entry at a time.
*/
$this->processNextEntry();
- } // @TODO elseif ($this->
+ } elseif ($this->isCsvFileFound()) {
+ /*
+ * A file containing a URL list is found. Please note the format is
+ * CSV-like as you may wish to provide metadata such as crawl
+ * depth, handling of 3rd-party URLs and such.
+ */
+ $this->importCsvFile();
+ }
$this->partialStub('Please implement this method.');
}