3 * A UploadedList URL source class for crawlers
5 * @author Roland Haeder <webmaster@ship-simu.org>
7 * @copyright Copyright (c) 2014 Crawler Developer Team
8 * @license GNU GPL 3.0 or any newer version
9 * @link http://www.ship-simu.org
11 * This program is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation, either version 3 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
/**
 * Full path of the directory that is scanned for CSV files. It is composed
 * once in the constructor from the 'base_path' and
 * 'crawler_csv_file_path' configuration entries.
 */
private $csvFilePath = '';

/**
 * Last CSV file instance. It is set by isCsvFileFound() and reset to NULL
 * by addCsvFile() once the instance has been pushed onto the stack.
 */
private $lastCsvFileInstance = NULL;

/**
 * Stack for pushing data from this class to another
 */
private $stackSourceInstance = NULL;

/**
 * Stack name for a CSV file
 */
const STACK_NAME_CSV_FILE = 'csv_file';

/**
 * "Imported" CSV files: base names of all files addCsvFile() has already
 * stacked. They are excluded when the directory is read again.
 */
private $csvFileImported = array();
/**
 * Protected constructor. Resolves the CSV directory from the configuration,
 * attaches a directory instance for it and prepares the CSV file stack.
 *
 * @return	void
 */
protected function __construct () {
	// Let the parent class set itself up with this class' name
	parent::__construct(__CLASS__);

	// Compose the CSV directory path once so later calls can reuse it
	$basePath   = $this->getConfigInstance()->getConfigEntry('base_path');
	$csvSubPath = $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
	$this->csvFilePath = $basePath . '/' . $csvSubPath;

	// Create a directory instance for that path and attach it to this source
	$this->setDirectoryInstance(ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath)));

	// Create the stack instance ...
	$this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');

	// ... and initialize the stack that will hold CSV file instances
	$this->stackSourceInstance->initStack(self::STACK_NAME_CSV_FILE);
}
/**
 * Checks whether a CSV file is found in configured path. The next directory
 * entry is read, leaving out '.htaccess', '.', '..' and every file name that
 * has already been marked as imported. If that entry ends with '.csv', a
 * CSV input file instance is created for it and kept in lastCsvFileInstance
 * so that addCsvFile() can stack it afterwards.
 *
 * @return	$isFound	Whether a CSV file is found
 */
private function isCsvFileFound () {
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

	// Is the directory iterator no longer valid?
	if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
		// Then rewind it so the directory is scanned from the start again
		$this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
	} // END - if

	// Read the next entry, skipping special entries and already imported files
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
	$directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));

	// Debug message
	/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);

	// Is it empty or wrong file extension?
	if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) !== '.csv')) {
		// Skip further processing
		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');

		// Nothing usable has been found, the caller must not stack anything
		return FALSE;
	} // END - if

	// Initialize CSV file instance
	$this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($this->csvFilePath . '/' . $directoryEntry));

	// Found an entry, lastCsvFileInstance is now ready for addCsvFile()
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - Instance created - EXIT!');
	return TRUE;
}
/**
 * Creates an instance of this class and initializes it as the
 * 'uploaded_list' source of the 'crawler'.
 *
 * @return	$sourceInstance		An instance of a Source class
 */
public final static function createCrawlerUploadedListUrlSource () {
	// Get a new instance of this very class
	$instance = new self();

	// Initialize the source with its prefix and name
	$instance->initSource('crawler', 'uploaded_list');

	// Hand the prepared instance back to the caller
	return $instance;
}
/**
 * Checks whether a CSV file has been loaded (added to the stack). This is
 * the case when the CSV file stack is both initialized and not empty.
 *
 * @return	$isLoaded	Whether a CSV file has been loaded
 */
private function isCsvFileAdded () {
	// Check whether the stacker is not empty
	$isLoaded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE)));

	// Return the result, processStack() branches on it
	return $isLoaded;
}
/**
 * Initializes the import of the CSV file which is being processed by other
 * task. The instance kept in lastCsvFileInstance is pushed onto the CSV
 * file stack, its base name is remembered as "imported" and the reference
 * held by this class is dropped afterwards.
 *
 * @return	void
 * @throws	NullPointerException	If lastCsvFileInstance is not set
 */
private function addCsvFile () {
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

	// Without a CSV file instance there is nothing to stack
	if ($this->lastCsvFileInstance === NULL) {
		// This should not happen
		throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
	} // END - if

	// Work on a local reference from here on
	$csvFileInstance = $this->lastCsvFileInstance;

	// Stack it ...
	$this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);

	// ... remember its base name so the directory scan skips it next time
	$this->csvFileImported[] = basename($csvFileInstance->getFileName());

	// ... and finally drop this class' own reference (to save some RAM)
	$this->lastCsvFileInstance = NULL;

	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}
/**
 * Parses the next stacked CSV by reading only one line from it. Then the
 * read line is being validated and if found good being feed to the next
 * stack. The file is removed from stack only if it has been fully parsed.
 *
 * NOTE(review): as far as this method shows, only the entry and exit debug
 * messages are emitted; the parsing described above is not implemented
 * here yet.
 *
 * @return	void
 */
private function parseCsvEntry () {
	// Common start of both debug messages
	$messagePrefix = 'CRAWLER-SOURCE [' . __METHOD__ . ':';

	// Debug message on entry
	/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput($messagePrefix . __LINE__ . ']: CALLED!');

	// Debug message on exit
	/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput($messagePrefix . __LINE__ . ']: EXIT!');
}
/**
 * Getter for stackSourceInstance variable, the stack instance created in
 * the constructor.
 *
 * @return	$stackSourceInstance	An instance of an additional stack
 */
public final function getStackSourceInstance () {
	// Hand out the stack instance held by this source
	$stackInstance = $this->stackSourceInstance;
	return $stackInstance;
}
/**
 * Processes entries in the stack. At most one of three steps is done per
 * call, checked in this order: parse an entry of an already stacked CSV
 * file, stack a newly found CSV file, or process the next entry of the
 * URL stack.
 *
 * @return	void
 */
public function processStack () {
	// Does the stack have some entries left?
	if ($this->isCsvFileAdded()) {
		/*
		 * A CSV file has been found and "imported" (added to stack). Now
		 * the file can be read line by line and checked every one of it.
		 */
		$this->parseCsvEntry();
	} elseif ($this->isCsvFileFound()) {
		/*
		 * A file containing an URL list is found. Please note the format is
		 * CSV-like as you may wish to provide meta data such as crawl
		 * depth, handling of 3rd-party URLs and such.
		 *
		 * isCsvFileFound() has just set lastCsvFileInstance, so the file
		 * can be pushed onto the stack and marked as "imported" here.
		 */
		$this->addCsvFile();
	} elseif (!$this->isUrlStackEmpty()) {
		/*
		 * Handle next entry. This method will be called very often, so need
		 * to process more than one entry at a time.
		 */
		$this->processNextEntry();
	} // END - if

	// NOTE(review): assumed to run on every call, after the chain above,
	// as long as this method is unfinished - confirm the intended placement.
	$this->partialStub('Please implement this method.');
}