]> git.mxchange.org Git - hub.git/blob - application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
Merge branch 'master' into refacuring/protocol_handler
[hub.git] / application / hub / main / source / urls / class_CrawlerUploadedListUrlSource.php
1 <?php
2 /**
3  * A UploadedList URL source class for crawlers
4  *
5  * @author              Roland Haeder <webmaster@ship-simu.org>
6  * @version             0.0.0
7  * @copyright   Copyright (c) 2014 Crawler Developer Team
8  * @license             GNU GPL 3.0 or any newer version
9  * @link                http://www.ship-simu.org
10  *
11  * This program is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation, either version 3 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
23  */
class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
	/**
	 * Name of the stack which holds CSV file instances
	 */
	const STACK_NAME_CSV_FILE = 'csv_file';

	/**
	 * Path to the directory being scanned for CSV files, assembled once in
	 * the constructor from the 'base_path' and 'crawler_csv_file_path'
	 * configuration entries.
	 */
	private $csvFilePath = '';

	/**
	 * CSV file instance created by the latest successful isCsvFileFound()
	 * call. It is reset to NULL again by addCsvFile().
	 */
	private $lastCsvFileInstance = NULL;

	/**
	 * Stack for pushing data from this class to another
	 */
	private $stackSourceInstance = NULL;

	/**
	 * Base names of all CSV files already pushed on the stack ("imported").
	 * These names are excluded when the directory is read again.
	 */
	private $csvFileImported = array();

	/**
	 * Protected constructor, use createCrawlerUploadedListUrlSource() to get
	 * an instance of this class.
	 *
	 * @return	void
	 */
	protected function __construct () {
		// The parent class needs to know the real class name
		parent::__construct(__CLASS__);

		// Assemble the CSV path only once, it is needed for every found file
		$basePath   = $this->getConfigInstance()->getConfigEntry('base_path');
		$csvSubPath = $this->getConfigInstance()->getConfigEntry('crawler_csv_file_path');
		$this->csvFilePath = $basePath . '/' . $csvSubPath;

		// Create a directory instance for that path and remember it
		$this->setDirectoryInstance(ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath)));

		// Create the stack for CSV file instances ...
		$this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');

		// ... and prepare the named stack on it
		$this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
	}

	/**
	 * Creates an instance of this class
	 *
	 * @return	$sourceInstance		An instance of a Source class
	 */
	public final static function createCrawlerUploadedListUrlSource () {
		// Create the instance and initialize it as "crawler"/"uploaded_list" source
		$sourceInstance = new CrawlerUploadedListUrlSource();
		$sourceInstance->initSource('crawler', 'uploaded_list');

		// The instance is now ready for usage
		return $sourceInstance;
	}

	/**
	 * Getter for stackSourceInstance variable
	 *
	 * @return	$stackSourceInstance	An instance of an additional stack
	 */
	public final function getStackSourceInstance () {
		return $this->stackSourceInstance;
	}

	/**
	 * Checks whether a CSV file is found in configured path. On success a CSV
	 * file instance for the found entry is kept in lastCsvFileInstance so
	 * addCsvFile() can pick it up.
	 *
	 * @return	$isFound	Whether a CSV file is found
	 */
	private function isCsvFileFound () {
		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

		// Start over again if the iterator is no longer on a valid entry
		if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
			$this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
		} // END - if

		// Already imported files must not be returned again, so exclude them, too
		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: this->csvFileImported=' . print_r($this->csvFileImported, TRUE));
		$excludedEntries = array_merge(array('.htaccess', '.', '..'), $this->csvFileImported);

		// Read next entry
		$entryName = $this->getDirectoryInstance()->readDirectoryExcept($excludedEntries);

		// Debug message
		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput(sprintf('CRAWLER-SOURCE[%s:%d] directoryEntry(%d)=%s', __METHOD__, __LINE__, strlen($entryName), $entryName));

		// Only non-empty entries ending with '.csv' are accepted
		$isCsvEntry = ((!empty($entryName)) && (substr($entryName, -4, 4) == '.csv'));

		if (!$isCsvEntry) {
			// Nothing usable found, skip further processing
			/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput(sprintf('CRAWLER-SOURCE[%s:%d] directoryEntry(%d)=%s - SKIPPED!', __METHOD__, __LINE__, strlen($entryName), $entryName));
			return FALSE;
		} // END - if

		// Create a CSV file instance from the full path of the found entry
		$this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($this->csvFilePath . '/' . $entryName));

		// Debug message
		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . '] directoryEntry(' . strlen($entryName) . ')=' . $entryName . ' - Instance created - EXIT!');

		// A CSV file has been found
		return TRUE;
	}

	/**
	 * Checks whether a CSV file has been loaded (added to the stack)
	 *
	 * @return	$isLoaded	Whether a CSV file has been loaded
	 */
	private function isCsvFileAdded () {
		$stackInstance = $this->getStackSourceInstance();

		// Without an initialized stack no file can have been added
		if (!$stackInstance->isStackInitialized(self::STACK_NAME_CSV_FILE)) {
			return FALSE;
		} // END - if

		// A file has been added if the stack holds at least one entry
		return (!$stackInstance->isStackEmpty(self::STACK_NAME_CSV_FILE));
	}

	/**
	 * Initializes the import of the CSV file which is being processed by other
	 * task. The file instance found by isCsvFileFound() is pushed on the
	 * stack and its base name is remembered as "imported".
	 *
	 * @return	void
	 * @throws	NullPointerException	If lastCsvFileInstance is not set
	 */
	private function addCsvFile () {
		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

		// There must be a CSV file instance from a previous isCsvFileFound() call
		if (NULL === $this->lastCsvFileInstance) {
			// This should not happen
			throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
		} // END - if

		// Push the file instance on the stack ...
		$this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);

		// ... remember its base name so the file is not found a second time ...
		$this->csvFileImported[] = basename($this->lastCsvFileInstance->getFileName());

		// ... and drop the local reference (to save some RAM)
		$this->lastCsvFileInstance = NULL;

		//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
	}

	/**
	 * Parses the next stacked CSV by reading only one line from it. Then the
	 * read line is being validated and if found good being fed to the next
	 * stack. The file is removed from stack only if it has been fully parsed.
	 *
	 * NOTE: This is the intended behaviour, currently this method only
	 * outputs debug messages on entry and exit.
	 *
	 * @return	void
	 */
	private function parseCsvEntry () {
		// Method has been entered
		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');

		// Method is about to be left
		/* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
	}

	/**
	 * Processes entries in the stack. Only one of the following steps is done
	 * per call, checked in this order: parse a stacked CSV file, add a newly
	 * found CSV file to the stack, or process the next entry from the URL
	 * stack. The partial stub is always reported afterwards.
	 *
	 * @return	void
	 * @todo	~20% done
	 */
	public function processStack () {
		if ($this->isCsvFileAdded()) {
			/*
			 * At least one CSV file is on the stack ("imported"), so continue
			 * with reading and checking it line by line.
			 */
			$this->parseCsvEntry();
		} elseif ($this->isCsvFileFound()) {
			/*
			 * A new file with an URL list has been found. The format is
			 * CSV-like as meta data such as crawl depth or handling of
			 * 3rd-party URLs may be provided along with the URL.
			 */
			$this->addCsvFile();
		} elseif (!$this->isUrlStackEmpty()) {
			/*
			 * The URL stack still has entries, so handle the next one. This
			 * method is called very often, so there is no need to process
			 * more than one entry at a time.
			 */
			$this->processNextEntry();
		}

		// This method is not finished yet
		$this->partialStub('Please implement this method.');
	}
}
224
225 // [EOF]
226 ?>