3 * An UploadedList URL source class for crawlers
5 * @author Roland Haeder <webmaster@ship-simu.org>
7 * @copyright Copyright (c) 2014 Crawler Developer Team
8 * @license GNU GPL 3.0 or any newer version
9 * @link http://www.ship-simu.org
11 * This program is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation, either version 3 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
// URL source built on BaseUrlSource; fulfils the UrlSource and Registerable
// interfaces. The visible methods look for CSV files in a configured
// directory and hand them over for import.
26 * Cached path of CSV files
// Filled by the constructor from the 'base_path' and 'crawler_csv_file_path'
// configuration entries, joined with '/'.
28 private $csvFilesPath = '';
// NOTE(review): presumably remembers the most recently handled CSV file; it
// starts as an empty string and is not referenced by any visible method -
// confirm its usage against the rest of the class.
33 private $lastCsvFile = '';
/**
 * Protected constructor.
 *
 * Hands this class' name to the parent constructor, caches the CSV
 * directory path assembled from the 'base_path' and
 * 'crawler_csv_file_path' configuration entries and stores a directory
 * instance created for that path.
 *
 * @return void
 */
protected function __construct () {
    // Let the parent set itself up with the name of this class
    parent::__construct(__CLASS__);

    // Build "<base_path>/<crawler_csv_file_path>" and cache it
    $basePath   = $this->getConfigInstance()->getConfigEntry('base_path');
    $csvSubPath = $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
    $this->csvFilesPath = $basePath . '/' . $csvSubPath;

    // Create the configured directory class for that path and keep it
    $this->setDirectoryInstance(
        ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilesPath))
    );
}
/**
 * Checks whether a CSV file is found in the configured path.
 *
 * Reads the next entry from the directory instance (skipping '.htaccess',
 * '.' and '..'). If that entry is non-empty and ends in '.csv', a CSV file
 * instance is created for it and stored through setCsvFileInstance().
 *
 * @return bool true if a CSV entry was read and its file instance has been
 *              set, false if the entry was empty or had another extension
 */
private function isCsvFileFound () {
    // Start over once the iterator's current position is no longer valid
    if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
        $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
    }

    // Read the next entry, leaving out housekeeping entries
    $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array('.htaccess', '.', '..'));

    // Debug message
    /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry);

    // Is it empty or wrong file extension?
    if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) !== '.csv')) {
        // Skip further processing
        /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('SOURCE[' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - SKIPPED!');

        // Leave here: without this return a CSV file instance would be
        // created for an empty or non-CSV entry
        return false;
    }

    // Initialize CSV instance for the entry inside the cached CSV path
    $csvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilesPath . '/' . $directoryEntry));

    // Keep it so the import step can use it
    $this->setCsvFileInstance($csvFileInstance);

    // A CSV file has been found and prepared
    return true;
}
/**
 * Factory method: creates an instance of this class and initialises it
 * as the 'uploaded_list' source of the 'crawler'.
 *
 * @return CrawlerUploadedListUrlSource The prepared source instance
 */
public final static function createCrawlerUploadedListUrlSource () {
    // The constructor is protected, so instances are only built here
    $instance = new self();

    // Initialise the source with its prefix and name
    $instance->initSource('crawler', 'uploaded_list');

    // Hand the ready-to-use instance back to the caller
    return $instance;
}
108 * Processes entries in the stack.
113 public function processStack () {
// One processing step, chosen by the current state:
//  - URL stack not empty                 -> processNextEntry()
//  - otherwise, isCsvFileFound() is true -> importCsvFile()
114 // Does the stack have some entries left?
115 if (!$this->isUrlStackEmpty()) {
117 * Handle next entry. This method will be called very often, so need
118 * to process more than one entry at a time.
120 $this->processNextEntry();
121 } elseif ($this->isCsvFileFound()) {
123 * A file containing an URL list is found. Please note the format is
124 * CSV-like as you may wish to provide meta data such as crawl
125 * depth, handling of 3rd-party URLs and such.
127 $this->importCsvFile();
// NOTE(review): the partialStub() call marks this method as unfinished. The
// visible lines do not show whether it belongs to a final else branch or is
// executed on every call - confirm against the complete file.
130 $this->partialStub('Please implement this method.');