X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=application%2Fhub%2Fmain%2Fsource%2Furls%2Fclass_CrawlerUploadedListUrlSource.php;fp=application%2Fhub%2Fmain%2Fsource%2Furls%2Fclass_CrawlerUploadedListUrlSource.php;h=5b1814594a3b565105a4a83f92c4d48c866637cb;hb=0d4b1675b69d4f54e11e80af16027e15b8da4eb1;hp=ec77631f0c64191c4ca852d3974b6e1f97ca667c;hpb=c6d1d880b6ee6289f5cdedc7b3eefdc1ae92cda2;p=hub.git diff --git a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php index ec77631f0..5b1814594 100644 --- a/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php @@ -73,32 +73,16 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } /** - * Creates an instance of this class - * - * @return $sourceInstance An instance of a Source class - */ - public final static function createCrawlerUploadedListUrlSource () { - // Get new instance - $sourceInstance = new CrawlerUploadedListUrlSource(); - - // Init source - $sourceInstance->initSource('crawler', 'uploaded_list'); - - // Return the prepared instance - return $sourceInstance; - } - - /** - * Checks whether a CSV file is found + * Checks whether a CSV file is found in configured path * * @return $isFound Whether a CSV file is found */ private function isCsvFileFound () { //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!'); - // Is the instance valid? + // Is it valid? if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) { - // Then rewind it + // Rewind to start $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind(); } // END - if @@ -106,9 +90,13 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE)); $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported)); - // The read entry has not to be empty and extension must be '.csv' + // Debug message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry); + + // Is it empty or wrong file extension? if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) { // Skip further processing + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!'); return FALSE; } // END - if @@ -120,6 +108,22 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R return TRUE; } + /** + * Creates an instance of this class + * + * @return $sourceInstance An instance of a Source class + */ + public final static function createCrawlerUploadedListUrlSource () { + // Get new instance + $sourceInstance = new CrawlerUploadedListUrlSource(); + + // Init source + $sourceInstance->initSource('crawler', 'uploaded_list'); + + // Return the prepared instance + return $sourceInstance; + } + /** * Initializes the import of the CSV file which is being processed by other task * @@ -165,10 +169,17 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R public function processStack () { // Does the stack have some entries left? if (!$this->isUrlStackEmpty()) { - // Handle next entry + /* + * Handle next entry. This method will be called very often, so need + * to process more than one entry at a time. + */ $this->processNextEntry(); } elseif ($this->isCsvFileFound()) { - // A CSV file has been found and can maybe be imported. + /* + * A file containing an URL list is found. Please note the format is + * CSV-like as you may wish to provide meta data such as crawl + * depth, handling of 3rd-party URLs and such. + */ $this->importCsvFile(); }