3 * A general URL source class
5 * @author Roland Haeder <webmaster@shipsimu.org>
7 * @copyright Copyright (c) 2007, 2008 Roland Haeder, 2009 - 2015 Hub Developer Team
8 * @license GNU GPL 3.0 or any newer version
9 * @link http://www.shipsimu.org
11 * This program is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation, either version 3 of the License, or
14 * (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 class BaseUrlSource extends BaseSource {
25 // Stack name for all URLs
26 const STACKER_NAME_URLS = 'urls';
28 // Array elements for CSV data array
29 const CRAWL_JOB_ARRAY_START_URL = 'start_url';
30 const CRAWL_JOB_ARRAY_DEPTH = 'start_depth';
31 const CRAWL_JOB_ARRAY_EXTERNAL_DEPTH = 'external_depth';
/**
 * Protected constructor. It only forwards the given class name to the
 * parent constructor.
 *
 * @param	$className	Name of the class
 * @return	void
 */
protected function __construct ($className) {
	// Hand the class name over to the parent class
	parent::__construct($className);
}
/**
 * Initializes this source by creating a file-based stack through
 * FileStackFactory and registering it on this instance.
 *
 * @param	$prefix			Prefix for this source ('_url' is appended before it is passed to the factory)
 * @param	$sourceName		Name of this source
 * @return	void
 */
protected function initSource ($prefix, $sourceName) {
	// Create the file stack with the dedicated factory and set it here in one step
	$this->setStackInstance(
		FileStackFactory::createFileStackInstance($prefix . '_url', $sourceName)
	);
}
/**
 * Determines whether the stack 'urls' is empty.
 *
 * The check is delegated to the assigned stack instance, using the stack
 * name from self::STACKER_NAME_URLS.
 *
 * @return	$isEmpty	Whether the stack 'urls' is empty.
 */
public function isUrlStackEmpty () {
	// Ask the stack instance whether the named stack holds any entries
	$isEmpty = $this->getStackInstance()->isStackEmpty(self::STACKER_NAME_URLS);

	// Return the result; without this statement callers would always receive NULL
	return $isEmpty;
}
/**
 * Enriches the given associative array with more data. At the moment this
 * method only verifies (by assertion) that the two mandatory elements are
 * set:
 *
 * 'start_url'   - Starting URL
 * 'start_depth' - Crawl depth for starting URL
 *
 * No further elements are added yet, see the TODO below.
 *
 * @param	$crawlData	Array with partial data for being queued (taken by reference)
 * @return	void
 */
protected function enrichCrawlerQueueData (array &$crawlData) {
	// Entry trace (disabled by default)
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!');

	// Both mandatory elements must be present in the array
	assert(isset($crawlData[self::CRAWL_JOB_ARRAY_START_URL]));
	assert(isset($crawlData[self::CRAWL_JOB_ARRAY_DEPTH]));

	// @TODO Add more elements

	// Exit trace (disabled by default)
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}
/**
 * Enqueues the given crawler array in the assigned file-based stack by
 * pushing it onto the stack named by self::STACKER_NAME_URLS.
 *
 * @param	$crawlData	Array with partial data for being queued
 * @return	void
 */
protected function enqueueInFileStack (array $crawlData) {
	// Entry trace (disabled by default)
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!');

	// Fetch the stack instance first ...
	$stackInstance = $this->getStackInstance();

	// ... then push the crawl data onto the URL stack
	$stackInstance->pushNamed(self::STACKER_NAME_URLS, $crawlData);

	// Exit trace (disabled by default)
	//* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
}