git.mxchange.org Git - hub.git/commitdiff
Continued with crawler:
author Roland Haeder <roland@mxchange.org>
Sat, 7 Mar 2015 17:54:54 +0000 (18:54 +0100)
committer Roland Haeder <roland@mxchange.org>
Sat, 7 Mar 2015 18:00:18 +0000 (19:00 +0100)
- Renamed method processStack() to fillUrlStack() to reflect its purpose
- Added isUrlStackEmpty() to interface as it is now public
- Added new base class BaseUrlSourceTask which will initialize all such tasks
  by creating the proper URL source instance
- Constants belong at the top of classes

Signed-off-by: Roland Haeder <roland@mxchange.org>
13 files changed:
application/hub/interfaces/source/urls/class_UrlSource.php
application/hub/main/resolver/protocol/tcp/class_TcpProtocolResolver.php
application/hub/main/source/class_BaseUrlSource.php
application/hub/main/source/urls/class_CrawlerFoundRssUrlSource.php
application/hub/main/source/urls/class_CrawlerLocalStartUrlSource.php
application/hub/main/source/urls/class_CrawlerRssStartUrlSource.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php [new file with mode: 0644]
application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSource???Task.php
application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceFoundRssTask.php
application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceLocalStartTask.php
application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceRssStartTask.php
application/hub/main/tasks/crawler/url_source/class_CrawlerUrlSourceUploadedListTask.php

index e95634b5c318788067200b97ffc0d1c2709ed84a..6a4a04677cad97a86ff9d64c8603ccbb0e372373 100644 (file)
  */
 interface UrlSource extends Source {
        /**
-        * Processes entries in the stack.
+        * Fills the URL stack with new entries from source
         *
         * @return      void
         */
-       function processStack ();
+       function fillUrlStack ();
+
+       /**
+        * Determines whether the stack 'urls' is empty.
+        *
+        * @return      $isEmpty        Whether the stack 'urls' is empty.
+        */
+       function isUrlStackEmpty ();
 }
 
 // [EOF]
index 027f0497ff4708ecf8f7ded5fd501f7f29c8707e..758ae780411724b2c266fa8dca16384fb4420006 100644 (file)
@@ -67,7 +67,7 @@ class TcpProtocolResolver extends BaseProtocolResolver implements ProtocolResolv
                $resultInstance = $nodeInstance->getWrapperInstance()->doSelectByCriteria($searchInstance);
 
                // Is the result valid?
-               if ((!$resultInstance->valid()) || (! $resultInstance->next())) {
+               if ((!$resultInstance->valid()) || (!$resultInstance->next())) {
                        // Node not found in database, this could mean that your database file is damaged.
                        return NULL;
                } // END - if
index 18286ba28a81fc05f181798a53119cf5e775d4fc..afc64492321beb14aac83737695fe5ec7d0c5368 100644 (file)
@@ -58,13 +58,31 @@ class BaseUrlSource extends BaseSource {
         *
         * @return      $isEmpty        Whether the stack 'urls' is empty.
         */
-       protected function isUrlStackEmpty () {
+       public function isUrlStackEmpty () {
                // Determine it
                $isEmpty = $this->getStackInstance()->isStackEmpty('urls');
 
                // Return result
                return $isEmpty;
        }
+
+       /**
+        * Enriches the given associative array with more data. At least 2
+        * elements are now required:
+        *
+        * 'start_url'   - Starting URL
+        * 'start_depth' - Crawl depth for starting URL
+        *
+        * @param       $crawlData      Array with partial data for being queued
+        * @return      void
+        */
+       protected function enrichCrawlerQueueData (array &$crawlData) {
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: crawlData()=' . count($crawlData) . ' - CALLED!');
+
+               // Debug message
+               /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+       }
 }
 
 // [EOF]
index d77847c2cd758311ba2e1f95573d7065138721c6..cad4691e7f38f37fcc9459bda498348a8e97d4d1 100644 (file)
@@ -49,18 +49,12 @@ class CrawlerFoundRssUrlSource extends BaseUrlSource implements UrlSource, Regis
        }
 
        /**
-        * Processes entries in the stack.
+        * Fills the URL stack with new entries from source
         *
         * @return      void
-        * @todo        ~10% done
+        * @todo        0% done
         */
-       public function processStack () {
-               // Does the stack have some entries left?
-               if ($this->isUrlStackEmpty()) {
-                       // Nothing to handle here
-                       return;
-               } // END - if
-
+       public function fillUrlStack () {
                $this->partialStub('Please implement this method.');
        }
 }
index a2679ebdb2799bedeebb2b7572c976ac97f1b9cd..fdabe06421aeff9df7b23ee3d0f1493d78b27a3b 100644 (file)
@@ -49,18 +49,12 @@ class CrawlerLocalStartUrlSource extends BaseUrlSource implements UrlSource, Reg
        }
 
        /**
-        * Processes entries in the stack.
+        * Fills the URL stack with new entries from source
         *
         * @return      void
-        * @todo        ~10% done
+        * @todo        0% done
         */
-       public function processStack () {
-               // Does the stack have some entries left?
-               if ($this->isUrlStackEmpty()) {
-                       // Nothing to handle here
-                       return;
-               } // END - if
-
+       public function fillUrlStack () {
                $this->partialStub('Please implement this method.');
        }
 }
index e955d027f2f93209024ea05c060556278b8ad09a..ef6ade1bd02e78eaee16ff28e1bf9c74ad7e5b2d 100644 (file)
@@ -49,18 +49,12 @@ class CrawlerRssStartUrlSource extends BaseUrlSource implements UrlSource, Regis
        }
 
        /**
-        * Processes entries in the stack.
+        * Fills the URL stack with new entries from source
         *
         * @return      void
-        * @todo        ~10% done
+        * @todo        0% done
         */
-       public function processStack () {
-               // Does the stack have some entries left?
-               if ($this->isUrlStackEmpty()) {
-                       // Nothing to handle here
-                       return;
-               } // END - if
-
+       public function fillUrlStack () {
                $this->partialStub('Please implement this method.');
        }
 }
index dd5ff03bcd9db1761e5ecaa168adc93abde7042b..749c3f631ed66821cb3da2b55c6aff0f6e930851 100644 (file)
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
-       /**
-        * "Cached" CSV path
-        */
-       private $csvFilePath = '';
-
-       /**
-        * Last CSV file instance
-        */
-       private $lastCsvFileInstance = NULL;
-
-       /**
-        * Stack for pushing data from this clas to another
-        */
-       private $stackSourceInstance = NULL;
-
        /**
         * Stack name for a CSV file
         */
@@ -56,6 +41,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
         */
        const CRAWL_ENTRY_SIZE = 3;
 
+       /**
+        * "Cached" CSV path
+        */
+       private $csvFilePath = '';
+
+       /**
+        * Last CSV file instance
+        */
+       private $lastCsvFileInstance = NULL;
+
+       /**
+        * Stack for pushing data from this class to another
+        */
+       private $stackSourceInstance = NULL;
+
        /**
         * "Imported" CSV files
         */
@@ -175,6 +175,9 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                        self::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2]
                );
 
+               // Then add more data to it
+               $this->enrichCrawlerQueueData($csvData);
+
                // Debug message
                /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
        }
@@ -318,12 +321,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        }
 
        /**
-        * Processes entries in the stack.
+        * Fills the URL stack with new entries from source
         *
         * @return      void
-        * @todo        ~20% done
+        * @todo        ~40% done
         */
-       public function processStack () {
+       public function fillUrlStack () {
                // Does the stack have some entries left?
                if ($this->isCsvEntryAdded()) {
                        /*
@@ -344,12 +347,6 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                         * depth, handling of 3rd-party URLs and such.
                         */
                        $this->addCsvFile();
-               } elseif (!$this->isUrlStackEmpty()) {
-                       /*
-                        * Handle next entry. This method will be called very often, so need
-                        * to process more than one entry at a time.
-                        */
-                       $this->processNextEntry();
                }
 
                $this->partialStub('Please implement this method.');
diff --git a/application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php b/application/hub/main/tasks/crawler/class_BaseUrlSourceTask.php
new file mode 100644 (file)
index 0000000..9989802
--- /dev/null
@@ -0,0 +1,54 @@
+<?php
+/**
+ * A general URL source Task
+ *
+ * @author             Roland Haeder <webmaster@shipsimu.org>
+ * @version            0.0.0
+ * @copyright  Copyright (c) 2007, 2008 Roland Haeder, 2009 - 2014 Hub Developer Team
+ * @license            GNU GPL 3.0 or any newer version
+ * @link               http://www.shipsimu.org
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+class BaseUrlSourceTask extends BaseTask {
+       /**
+        * Protected constructor
+        *
+        * @param       $className      Name of the class
+        * @return      void
+        */
+       protected function __construct ($className) {
+               // Call parent constructor
+               parent::__construct($className);
+
+               // Init this URL source task
+               $this->initUrlSourceTask();
+       }
+
+       /**
+        * Initializes URL source task (to keep the constructor small)
+        *
+        * @return      void
+        */
+       private function initUrlSourceTask () {
+               // Get source instance
+               $sourceInstance = UrlSourceObjectFactory::createUrlSourceInstance($this);
+
+               // And set it here
+               $this->setSourceInstance($sourceInstance);
+       }
+}
+
+// [EOF]
+?>
index b62a9df06507024861b7d1c39659581c4fc7a656..48a6a42c89dbc5e29a5f284eca860135f9ce99e5 100644 (file)
@@ -21,7 +21,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-class CrawlerUrlSource???Task extends BaseTask implements Taskable, Visitable {
+class CrawlerUrlSource???Task extends BaseUrlSourceTask implements Taskable, Visitable {
        /**
         * Protected constructor
         *
index 14236c6ffa6ed537675ecefb1549d905ce3f02eb..dd912b465b77afa13f80a022ac040fc734ef765d 100644 (file)
@@ -21,7 +21,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-class CrawlerUrlSourceFoundRssTask extends BaseTask implements Taskable, Visitable {
+class CrawlerUrlSourceFoundRssTask extends BaseUrlSourceTask implements Taskable, Visitable {
        /**
         * Protected constructor
         *
@@ -64,7 +64,7 @@ class CrawlerUrlSourceFoundRssTask extends BaseTask implements Taskable, Visitab
         */
        public function executeTask () {
                // Get the URL source instance and announce us
-               UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack();
+               $this->getSourceInstance()->fillUrlStack();
        }
 }
 
index 9fdb71d1dd90054bb62a6ec6cd2631ca51626fca..5938d2cb75647e4a6cb5056f2760bec8adec65dc 100644 (file)
@@ -21,7 +21,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-class CrawlerUrlSourceLocalStartTask extends BaseTask implements Taskable, Visitable {
+class CrawlerUrlSourceLocalStartTask extends BaseUrlSourceTask implements Taskable, Visitable {
        /**
         * Protected constructor
         *
@@ -64,7 +64,7 @@ class CrawlerUrlSourceLocalStartTask extends BaseTask implements Taskable, Visit
         */
        public function executeTask () {
                // Get the URL source instance and announce us
-               UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack();
+               $this->getSourceInstance()->fillUrlStack();
        }
 }
 
index 413c7ad5d83503f293bca55d5af61b07770b55f1..e8c317541b50e73b94eee73fd5440228450b9c12 100644 (file)
@@ -21,7 +21,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-class CrawlerUrlSourceRssStartTask extends BaseTask implements Taskable, Visitable {
+class CrawlerUrlSourceRssStartTask extends BaseUrlSourceTask implements Taskable, Visitable {
        /**
         * Protected constructor
         *
@@ -64,7 +64,7 @@ class CrawlerUrlSourceRssStartTask extends BaseTask implements Taskable, Visitab
         */
        public function executeTask () {
                // Get the URL source instance and announce us
-               UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack();
+               $this->getSourceInstance()->fillUrlStack();
        }
 }
 
index 7330dda2d886d6b10b421a0e88fc2b54282d5846..d8092485866c1219898e8448d4bcc373a36833f1 100644 (file)
@@ -21,7 +21,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-class CrawlerUrlSourceUploadedListTask extends BaseTask implements Taskable, Visitable {
+class CrawlerUrlSourceUploadedListTask extends BaseUrlSourceTask implements Taskable, Visitable {
        /**
         * Protected constructor
         *
@@ -64,7 +64,7 @@ class CrawlerUrlSourceUploadedListTask extends BaseTask implements Taskable, Vis
         */
        public function executeTask () {
                // Get the URL source instance and announce us
-               UrlSourceObjectFactory::createUrlSourceInstance($this)->processStack();
+               $this->getSourceInstance()->fillUrlStack();
        }
 }