* @version 0.0.0
 * @copyright Copyright (c) 2014 Crawler Developer Team
 * @license GNU GPL 3.0 or any newer version
 * @link http://www.ship-simu.org
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
    /**
     * Stack name for a CSV file
     */
    const STACK_NAME_CSV_FILE = 'csv_file';

    /**
     * Stack name for a CSV entry
     */
    const STACK_NAME_CSV_ENTRY = 'csv_entry';

    /**
     * Size of crawl (CSV) entry which is an indexed array:
     *
     * 0 = URL to crawl
     * 1 = Crawl depth of URL
     * 2 = Crawl depth of linked URLs on other hosts (mapped to
     *     CRAWL_JOB_ARRAY_EXTERNAL_DEPTH, see saveCsvDataInCrawlerQueue())
     */
    const CRAWL_ENTRY_SIZE = 3;

    /**
     * "Cached" CSV path
     */
    private $csvFilePath = '';

    /**
     * Last CSV file instance
     */
    private $lastCsvFileInstance = NULL;

    /**
     * Stack for pushing data from this class to another
     */
    private $stackSourceInstance = NULL;

    /**
     * "Imported" CSV files (base names only), these are excluded when the
     * directory is read again.
     */
    private $csvFileImported = array();

    /**
     * "Cached" separator for columns
     */
    private $columnSeparator = '';

    /**
     * Protected constructor, use createCrawlerUploadedListUrlSource() to get
     * an instance. It caches the CSV path and column separator from
     * configuration, creates the directory instance and initializes both
     * named stacks.
     *
     * @return void
     */
    protected function __construct () {
        // Call parent constructor
        parent::__construct(__CLASS__);

        // "Cache" CSV path for faster usage
        $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');

        // Initialize directory instance
        $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));

        // Set it here
        $this->setDirectoryInstance($directoryInstance);

        // Init stack instance
        $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');

        // Init stacks
        $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
        $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_ENTRY);

        // "Cache" column separator
        $this->columnSeparator = $this->getConfigInstance()->getConfigEntry('crawler_url_list_column_separator');
    }

    /**
     * Checks whether a CSV file is found in configured path. If one is found,
     * a CSV input file instance is created for it and kept in
     * lastCsvFileInstance (to be picked up by addCsvFile()).
     *
     * @return $isFound Whether a CSV file is found
     */
    private function isCsvFileFound () {
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

        // Is it valid?
        if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
            // Rewind to start
            $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
        } // END - if

        // Read next entry, already imported files are excluded
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
        $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);

        // Is it empty or wrong file extension?
        if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) !== '.csv')) {
            // Skip further processing
            //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
            return FALSE;
        } // END - if

        // Initialize CSV file instance
        $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($this->csvFilePath . '/' . $directoryEntry));

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - Instance created - EXIT!');

        // Found an entry
        return TRUE;
    }

    /**
     * Creates an instance of this class
     *
     * @return $sourceInstance An instance of a Source class
     */
    public final static function createCrawlerUploadedListUrlSource () {
        // Get new instance
        $sourceInstance = new CrawlerUploadedListUrlSource();

        // Init source
        $sourceInstance->initSource('crawler', 'uploaded_list');

        // Return the prepared instance
        return $sourceInstance;
    }

    /**
     * Enriches and saves the given CSV entry (array) in the assigned
     * file-based stack. To such entry a lot more information is added, such
     * as which files shall be crawled and many more.
     *
     * @param $csvData Array with data from a CSV file, indexed as described at CRAWL_ENTRY_SIZE
     * @return void
     */
    private function saveCsvDataInCrawlerQueue (array $csvData) {
        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!');

        // The array has 3 elements, later enhancements may accept more
        assert(count($csvData) == self::CRAWL_ENTRY_SIZE);

        /*
         * First convert the indexed array into an associative array. Don't
         * forget to expand this array as well when you want to add another
         * column to the CSV file.
         */
        $csvArray = array(
            self::CRAWL_JOB_ARRAY_START_URL      => $csvData[0],
            self::CRAWL_JOB_ARRAY_DEPTH          => $csvData[1],
            self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2]
        );

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - BEFORE!');

        // Then add more data to it
        $this->enrichCrawlerQueueData($csvArray);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - AFTER!');

        /*
         * Then enqueue it in the file stack. The local crawler "task" will
         * then pick this up.
         */
        $this->enqueueInFileStack($csvArray);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
    }

    /**
     * Checks whether a CSV file has been loaded (added to the stack)
     *
     * @return $isAdded Whether a CSV file has been loaded
     */
    private function isCsvFileAdded () {
        // The stack must be initialized and must not be empty
        return (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE)));
    }

    /**
     * Checks whether a CSV entry has been added to the stack
     *
     * @return $isAdded Whether a CSV entry has been added
     */
    private function isCsvEntryAdded () {
        // The stack must be initialized and must not be empty
        return (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY)));
    }

    /**
     * Initializes the import of the CSV file which is being processed by other task
     *
     * @return void
     * @throws NullPointerException If lastCsvFileInstance is not set
     */
    private function addCsvFile () {
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

        // Is the instance set?
        if (is_null($this->lastCsvFileInstance)) {
            // This should not happen
            throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
        } // END - if

        // Stack this file
        $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);

        // ... and mark it as "imported" so isCsvFileFound() excludes it next time
        array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));

        // ... and finally NULL it (to save some RAM)
        $this->lastCsvFileInstance = NULL;

        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
    }

    /**
     * Parses the next stacked CSV file by reading only one line from it. Then
     * the read line is being validated and if found good being fed to the next
     * stack. A line with an unexpected number of columns is skipped. The file
     * is removed from stack only if it has been fully parsed.
     *
     * @return void
     */
    private function parseCsvFile () {
        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

        // Get next entry
        $csvFileInstance = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_FILE);

        // Read full "CSV line"
        $csvData = $csvFileInstance->readCsvFileLine($this->columnSeparator);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));

        // Expect always an array
        assert(is_array($csvData));

        // Is the array empty?
        if (count($csvData) === 0) {
            // Debug message
            //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');

            // Try to close it
            $csvFileInstance->closeFile();

            // This file has been fully read, so don't push it back on stack.
            return;
        } // END - if

        // ... with 3 elements, later enhancements may accept more
        assert(count($csvData) == self::CRAWL_ENTRY_SIZE);

        /*
         * Push the file back on stack as it may contain more entries. This way
         * all files got rotated on stack which may improve crawler performance.
         */
        $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);

        /*
         * assert() may be disabled or only emit a warning, so the column count
         * is also checked at runtime. A malformed line is skipped here instead
         * of being pushed on the entry stack, where it would later be read
         * with missing offsets in saveCsvDataInCrawlerQueue().
         */
        if (count($csvData) !== self::CRAWL_ENTRY_SIZE) {
            // Debug message
            //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - malformed line SKIPPED!');

            // Don't feed this line to the next stack
            return;
        } // END - if

        // Push array on next stack
        $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_ENTRY, $csvData);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
    }

    /**
     * Parses the next stacked CSV entry by popping it from the entry stack
     * and handing it over to saveCsvDataInCrawlerQueue().
     *
     * @return void
     */
    private function parseCsvEntry () {
        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

        // Pop it from stack
        $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));

        // It must have 3 elements (see method parseCsvFile() for details)
        assert(count($csvData) == self::CRAWL_ENTRY_SIZE);

        // Save it in crawler queue (which will enrich it with way more information)
        $this->saveCsvDataInCrawlerQueue($csvData);

        // Debug message
        //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
    }

    /**
     * Getter for stackSourceInstance variable
     *
     * @return $stackSourceInstance An instance of an additional stack
     */
    public final function getStackSourceInstance () {
        return $this->stackSourceInstance;
    }

    /**
     * Fills the URL stack with new entries from source. Only one step is done
     * per call: a stacked CSV entry is handled first, else one line is read
     * from a stacked CSV file, else the configured path is checked for a new
     * CSV file.
     *
     * @return void
     * @todo ~40% done
     */
    public function fillUrlStack () {
        // Does the stack have some entries left?
        if ($this->isCsvEntryAdded()) {
            /*
             * A line from a CSV file has been read and added to the entry
             * stack. Now it can be converted and enqueued for the crawler.
             */
            $this->parseCsvEntry();
        } elseif ($this->isCsvFileAdded()) {
            /*
             * A CSV file has been found and "imported" (added to stack). Now
             * the file can be read line by line and checked every one of it.
             */
            $this->parseCsvFile();
        } elseif ($this->isCsvFileFound()) {
            /*
             * A file containing an URL list is found. Please note the format is
             * CSV-like as you may wish to provide meta data such as crawl
             * depth, handling of 3rd-party URLs and such.
             */
            $this->addCsvFile();
        }

        $this->partialStub('Please implement this method.');
    }
}

// [EOF]
?>