3 * A UploadedList URL source class for crawlers
5 * @author Roland Haeder <webmaster@ship-simu.org>
7 * @copyright Copyright (c) 2014 Crawler Developer Team
8 * @license GNU GPL 3.0 or any newer version
9 * @link http://www.ship-simu.org
11 * This program is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation, either version 3 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
// URL source that takes crawl jobs from CSV files found in a configured
// directory. Files and their parsed rows are buffered on two named stacks.
26 * Stack name for a CSV file
// Names the stack that holds CSV file instances (see addCsvFile() and parseCsvFile()).
28 const STACK_NAME_CSV_FILE = 'csv_file';
31 * Stack name for a CSV entry
// Names the stack that holds single parsed CSV rows (see parseCsvFile() and parseCsvEntry()).
33 const STACK_NAME_CSV_ENTRY = 'csv_entry';
36 * Size of crawl (CSV) entry which is an indexed array:
39 * 1 = Crawl depth of URL
40 * 2 = Crawl depth of linked URLs (same other host only)
// Compared via assert() against count($csvData) wherever a row is handled.
// Index 0 is mapped to CRAWL_JOB_ARRAY_START_URL in saveCsvDataInCrawlerQueue().
42 const CRAWL_ENTRY_SIZE = 3;
// Directory that is scanned for CSV files; the constructor builds it from the
// config entries 'base_path' and 'crawler_csv_file_path', joined by '/'.
47 private $csvFilePath = '';
50 * Last CSV file instance
// Set by isCsvFileFound(), pushed onto the file stack and reset to NULL by addCsvFile().
52 private $lastCsvFileInstance = NULL;
55 * Stack for pushing data from this class to another
// Created in the constructor from the configured class name
// 'crawler_uploaded_list_url_source_stack_class'; exposed via getStackSourceInstance().
57 private $stackSourceInstance = NULL;
60 * "Imported" CSV files
// Base names of files already pushed by addCsvFile(); isCsvFileFound() merges
// them into the list it passes to readDirectoryExcept().
62 private $csvFileImported = array();
65 * "Cached" separator for columns
// Read once from config entry 'crawler_url_list_column_separator' and passed
// to readCsvFileLine() in parseCsvFile().
67 private $columnSeparator = '';
70 * Protected constructor
74 protected function __construct () {
// Prepares everything the source needs: the CSV directory path, a directory
// instance for it, the extra stack with its two named stacks and the cached
// column separator. Protected, so instances come from the static factory
// createCrawlerUploadedListUrlSource().
75 // Call parent constructor
76 parent::__construct(__CLASS__);
78 // "Cache" CSV path for faster usage
79 $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
81 // Initialize directory instance
82 $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
// Store it through the setter; presumably this is what getDirectoryInstance()
// hands back in isCsvFileFound() - the accessor pair is defined outside this class body.
85 $this->setDirectoryInstance($directoryInstance);
87 // Init stack instance
88 $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
// Initialize both named stacks up front: one for CSV file instances, one for CSV rows
91 $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
92 $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_ENTRY);
94 // "Cache" column separator
95 $this->columnSeparator = $this->getConfigInstance()->getConfigEntry('crawler_url_list_column_separator');
99 * Checks whether a CSV file is found in configured path
101 * @return $isFound Whether a CSV file is found
103 private function isCsvFileFound () {
// Asks the directory instance for an entry that has not been imported yet and,
// if it ends in '.csv', creates a CSV input file instance for it in
// $this->lastCsvFileInstance.
// NOTE(review): the docblock promises a boolean $isFound, but no return
// statement is visible in this excerpt - confirm both exit paths return one.
104 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
// Rewind the directory iterator when it no longer points to a valid entry
107 if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
109 $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
113 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
// The exception list is '.htaccess', '.', '..' plus every file name already
// recorded in $this->csvFileImported, so an imported file is not offered again.
114 $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
117 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);
119 // Is it empty or wrong file extension?
// The last four characters are compared case-sensitively, so e.g. 'LIST.CSV' is skipped too.
120 if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
121 // Skip further processing
122 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');
126 // Initialize CSV file instance
// The full path is the cached CSV directory plus '/' plus the entry name.
127 $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($this->csvFilePath . '/' . $directoryEntry));
130 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - Instance created - EXIT!');
137 * Creates an instance of this class
139 * @return $sourceInstance An instance of a Source class
141 public final static function createCrawlerUploadedListUrlSource () {
// Static factory: the constructor is protected, so this is the way to obtain
// an instance from outside the class hierarchy.
143 $sourceInstance = new CrawlerUploadedListUrlSource();
// initSource() is not defined in this excerpt; it is called with the fixed
// arguments 'crawler' and 'uploaded_list'.
146 $sourceInstance->initSource('crawler', 'uploaded_list');
148 // Return the prepared instance
149 return $sourceInstance;
153 * Enriches and saves the given CSV entry (array) in the assigned
154 * file-based stack. Much more information is added to such an entry, such
155 * as which files shall be crawled and many more.
157 * @param $csvData Array with data from a CSV file
160 private function saveCsvDataInCrawlerQueue (array $csvData) {
// Converts one indexed CSV row into an associative crawl job array, has it
// enriched by enrichCrawlerQueueData() and enqueues it via enqueueInFileStack().
// Both helpers and the CRAWL_JOB_ARRAY_* constants are defined outside this excerpt.
162 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData()=' . count($csvData) . ' - CALLED!');
164 // The array has 3 elements, later enhancements may accept more
// NOTE(review): assert() can be disabled by configuration (zend.assertions); in
// that case a row with a different element count reaches the index accesses below.
165 assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
168 * First converted the indexed array into an assoziative array. Don't
169 * forget to expand this array as well when you want to add another
170 * column to the CSV file.
// Column mapping: 0 = start URL, 1 = crawl depth, 2 = depth for linked (external) URLs.
// NOTE(review): the statement opening this array (presumably the assignment to
// $csvArray, which is used below) is not visible in this excerpt.
173 self::CRAWL_JOB_ARRAY_START_URL => $csvData[0],
174 self::CRAWL_JOB_ARRAY_DEPTH => $csvData[1],
175 self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2]
179 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - BEFORE!');
181 // Then add more data to it
// The return value is not used here, so enrichCrawlerQueueData() presumably
// takes the array by reference - confirm against its declaration.
182 $this->enrichCrawlerQueueData($csvArray);
185 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvArray()=' . count($csvArray) . ' - AFTER!');
188 * Then enqueue it in the file stack. The local crawler "task" will
191 $this->enqueueInFileStack($csvArray);
194 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
198 * Checks whether a CSV file has been loaded (added to the stack)
200 * @return $isAdded Whether a CSV file has been loaded
202 private function isCsvFileAdded () {
// Tells fillUrlStack() whether at least one CSV file instance is waiting on
// the file stack.
203 // Check whether the stacker is not empty
// TRUE only if the stack STACK_NAME_CSV_FILE is initialized AND holds an entry;
// the emptiness check is skipped (short-circuit) for an uninitialized stack.
204 $isAdded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_FILE)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_FILE)));
211 * Checks whether a CSV entry has been added to the stack
213 * @return $isAdded Whether a CSV entry has been added
215 private function isCsvEntryAdded () {
// Tells fillUrlStack() whether at least one parsed CSV row is waiting on the
// entry stack.
216 // Check whether the stacker is not empty
// TRUE only if the stack STACK_NAME_CSV_ENTRY is initialized AND holds an entry;
// the emptiness check is skipped (short-circuit) for an uninitialized stack.
217 $isAdded = (($this->getStackSourceInstance()->isStackInitialized(self::STACK_NAME_CSV_ENTRY)) && (!$this->getStackSourceInstance()->isStackEmpty(self::STACK_NAME_CSV_ENTRY)));
224 * Initializes the import of the CSV file which is being processed by other task
227 * @throws NullPointerException If lastCsvFileInstance is not set
229 private function addCsvFile () {
// Moves the CSV file instance prepared by isCsvFileFound() onto the file
// stack and records its base name so the directory scan excludes it from then on.
230 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
232 // Is the instance set?
233 if (is_null($this->lastCsvFileInstance)) {
234 // This should not happen
// ... because isCsvFileFound() is the only visible place that sets the
// instance and it is expected to have succeeded before this method runs.
235 throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
// Push the file instance onto the named file stack ...
239 $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);
241 // ... and mark it as "imported"
// Only the base name is stored, matching the bare entry names that
// isCsvFileFound() passes to readDirectoryExcept().
242 array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));
244 // ... and finally NULL it (to save some RAM)
// The instance itself stays referenced by the stack; only this property is cleared.
245 $this->lastCsvFileInstance = NULL;
247 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
251 * Parses the next stacked CSV file by reading only one line from it. Then
252 * the read line is validated and, if found valid, fed to the next
253 * stack. The file is removed from stack only if it has been fully parsed.
257 private function parseCsvFile () {
// Takes one CSV file instance from the file stack, reads a single line from
// it and either closes the file (nothing left to read) or pushes the file
// back and forwards the parsed row to the entry stack.
259 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
// Callers must ensure the stack is not empty (fillUrlStack() checks isCsvFileAdded() first)
262 $csvFileInstance = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_FILE);
264 // Read full "CSV line"
265 $csvData = $csvFileInstance->readCsvFileLine($this->columnSeparator);
268 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
270 // Expect always an array
271 assert(is_array($csvData));
273 // Is the array empty?
// An empty array is treated as "end of file" here.
274 if (count($csvData) == 0) {
276 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: File ' . $csvFileInstance->getFileName() . ' has been fully read.');
279 $csvFileInstance->closeFile();
281 // This file has been fully read, so don't push it back on stack.
// NOTE(review): the early exit that should end this branch is not visible in
// this excerpt - confirm the code below is not reached with an empty row.
285 // ... with 3 elements, later enhancements may accept more
// NOTE(review): assert() can be disabled by configuration (zend.assertions), so
// a row with a different column count may still be pushed below.
286 assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
289 * Push the file back on stack as it may contain more entries. This way
290 * all files got rotated on stack which may improve crawler performance.
// NOTE(review): if the stack is LIFO, popping and pushing back returns the
// same file to the top, so no rotation would take place - confirm the stack type.
292 $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $csvFileInstance);
294 // Push array on next stack
295 $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_ENTRY, $csvData);
298 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
302 * Parses the next stacked CSV entry.
306 private function parseCsvEntry () {
// Takes one parsed CSV row from the entry stack and hands it to
// saveCsvDataInCrawlerQueue() for conversion, enrichment and enqueueing.
308 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
// Callers must ensure the stack is not empty (fillUrlStack() checks isCsvEntryAdded() first)
311 $csvData = $this->getStackSourceInstance()->popNamed(self::STACK_NAME_CSV_ENTRY);
314 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: csvData[' . gettype($csvData) . ']=' . print_r($csvData, TRUE));
316 // It must have 3 elements (see method parseCsvFile() for details)
317 assert(count($csvData) == self::CRAWL_ENTRY_SIZE);
319 // Save it in crawler queue (which will enrich it with much more information)
320 $this->saveCsvDataInCrawlerQueue($csvData);
323 //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
327 * Getter for stackSourceInstance variable
329 * @return $stackSourceInstance An instance of an additional stack
331 public final function getStackSourceInstance () {
// Plain getter for the additional stack created in the constructor; it holds
// the two named stacks STACK_NAME_CSV_FILE and STACK_NAME_CSV_ENTRY.
332 return $this->stackSourceInstance;
336 * Fills the URL stack with new entries from source
341 public function fillUrlStack () {
// Performs one step of the import pipeline per call, checking the stages from
// last to first: a stacked CSV row, then a stacked CSV file, then a new file
// in the directory. Only the first matching stage is handled.
342 // Does the stack have some entries left?
343 if ($this->isCsvEntryAdded()) {
// Stage 3: a parsed row is waiting on the entry stack, so queue it.
// NOTE(review): the comment below duplicates the one in the next branch and
// describes files rather than entries.
345 * A CSV file has been found and "imported" (added to stack). Now
346 * the file can be read line by line and checked every one of it.
348 $this->parseCsvEntry();
349 } elseif ($this->isCsvFileAdded()) {
// Stage 2: a CSV file is on the file stack, so read one line from it.
351 * A CSV file has been found and "imported" (added to stack). Now
352 * the file can be read line by line and checked every one of it.
354 $this->parseCsvFile();
355 } elseif ($this->isCsvFileFound()) {
// Stage 1: isCsvFileFound() has prepared $this->lastCsvFileInstance.
// NOTE(review): no statement for this branch is visible in this excerpt;
// addCsvFile() is the only visible method that consumes that instance -
// confirm it is called here.
357 * A file containing an URL list is found. Please note the format is
358 * CSV-like as you may wish to provide meta data such as crawl
359 * depth, handling of 3rd-party URLs and such.
// The method is still flagged as unfinished through partialStub().
364 $this->partialStub('Please implement this method.');