X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=application%2Fhub%2Fmain%2Fsource%2Furls%2Fclass_CrawlerUploadedListUrlSource.php;h=5b1814594a3b565105a4a83f92c4d48c866637cb;hb=0d4b1675b69d4f54e11e80af16027e15b8da4eb1;hp=41fe3d7d7e2ea07b97b5f2b0f212ad7f6a187424;hpb=fa1e85c6f610356bc1b4f7b01a57acce6741782f;p=hub.git diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index 41fe3d7d7..5b1814594 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -22,6 +22,31 @@ * along with this program. If not, see . */ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable { + /** + * "Cached" CSV path + */ + private $csvFilePath = ''; + + /** + * Last CSV file instance + */ + private $lastCsvFileInstance = NULL; + + /** + * Stack for pushing data from this class to another + */ + private $stackSourceInstance = NULL; + + /** + * Stack name for a CSV file + */ + const STACK_NAME_CSV_FILE = 'csv_file'; + + /** + * "Imported" CSV files + */ + private $csvFileImported = array(); + /** * Protected constructor * @@ -30,6 +55,57 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R protected function __construct () { // Call parent constructor parent::__construct(__CLASS__); + + // "Cache" CSV path for faster usage + $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . 
$this->getConfigInstance()->getConfigEntry('csv_file_path'); + + // Initialize directory instance + $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath)); + + // Set it here + $this->setDirectoryInstance($directoryInstance); + + // Init stack instance + $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class'); + + // Init stack + $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE); + } + + /** + * Checks whether a CSV file is found in configured path + * + * @return $isFound Whether a CSV file is found + */ + private function isCsvFileFound () { + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + + // Is it valid? + if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) { + // Rewind to start + $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind(); + } // END - if + + // Read next entry + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE)); + $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported)); + + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry); + + // Is it empty or wrong file extension? + if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) { + // Skip further processing + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . 
' - SKIPPED!'); + return FALSE; + } // END - if + + // Initialize CSV file instance + $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry)); + + // Found an entry + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + return TRUE; } /** @@ -44,12 +120,46 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R // Init source $sourceInstance->initSource('crawler', 'uploaded_list'); - // Get a ??? @TODO - // Return the prepared instance return $sourceInstance; } + /** + * Initializes the import of the CSV file which is being processed by another task + * + * @return void + * @throws NullPointerException If lastCsvFileInstance is not set + */ + private function importCsvFile () { + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); + + // Is the instance set? + if (is_null($this->lastCsvFileInstance)) { + // This should not happen + throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER); + } // END - if + + // Stack this file + $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance); + + // ... and mark it as "imported" + array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName())); + + // ... and finally NULL it (to save some RAM) + $this->lastCsvFileInstance = NULL; + + //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!'); + } + + /** + * Getter for stackSourceInstance variable + * + * @return $stackSourceInstance An instance of an additional stack + */ + public final function getStackSourceInstance () { + return $this->stackSourceInstance; + } + /** * Processes entries in the stack. 
* @@ -59,9 +169,19 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R public function processStack () { // Does the stack have some entries left? if (!$this->isUrlStackEmpty()) { - // Nothing to handle here + /* + * Handle next entry. This method will be called very often, so there is no need + * to process more than one entry at a time. + */ $this->processNextEntry(); - } elseif ($this-> + } elseif ($this->isCsvFileFound()) { + /* + * A file containing a URL list is found. Please note the format is + * CSV-like as you may wish to provide meta data such as crawl + * depth, handling of 3rd-party URLs and such. + */ + $this->importCsvFile(); + } $this->partialStub('Please implement this method.'); }