]> git.mxchange.org Git - hub.git/commitdiff
Continued:
authorRoland Haeder <roland@mxchange.org>
Sun, 7 Dec 2014 21:45:55 +0000 (22:45 +0100)
committerRoland Haeder <roland@mxchange.org>
Sun, 7 Dec 2014 21:45:55 +0000 (22:45 +0100)
- Added some more stuff for URL CSV file importing
- Ignored all files in data/url_lists/ (and removed a "demo" file)
- Used new 'core'

Signed-off-by: Roland Haeder <roland@mxchange.org>
.gitignore
application/hub/config.php
application/hub/main/source/urls/class_CrawlerUploadedListUrlSource.php
core
data/url_lists/demo.lst [deleted file]

index fc60ff9a31c75a40905e07b53a1f286a4969e1f4..3dc43e248f9a01eaecbcf9ee5d66cff252c5c0f4 100644 (file)
@@ -11,3 +11,4 @@ docs/latex/*
 docs/warn.log
 /nbproject
 data/stacks/*.stack*
+data/url_lists/*.*
index c4b77f9af9eed4489412c9cb3bd2396b18c4348f..33069d54f026ad4de21587a26a55474c806ba3d0 100644 (file)
@@ -1289,6 +1289,12 @@ $cfg->setConfigEntry('crawler_url_rss_start_file_stack_index_class', 'FileStackI
 // CFG: CRAWLER-URL-FOUND-RSS-FILE-STACK-INDEX-CLASS
 $cfg->setConfigEntry('crawler_url_found_rss_file_stack_index_class', 'FileStackIndex');
 
+// CFG: CRAWLER-UPLOADED-LIST-URL-SOURCE-STACK-CLASS
+$cfg->setConfigEntry('crawler_uploaded_list_url_source_stack_class', 'FiFoStacker');
+
+// CFG: STACKER-CSV-FILE-MAX-SIZE
+$cfg->setConfigEntry('stacker_csv_file_max_size', 10);
+
 // CFG: TASK-CRAWLER-NODE-COMMUNICATOR-STARTUP-DELAY
 $cfg->setConfigEntry('task_crawler_node_communicator_startup_delay', 500);
 
@@ -1454,6 +1460,9 @@ $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_interval_delay', 1000);
 // CFG: TASK-CRAWLER-UPLOADED-LIST-SCANNER-MAX-RUNS
 $cfg->setConfigEntry('task_crawler_uploaded_list_scanner_max_runs', 0);
 
+// CFG: CSV-FILE-PATH
+$cfg->setConfigEntry('csv_file_path', 'data/url_lists');
+
 ///////////////////////////////////////////////////////////////////////////////
 //                            HTTP Configuration
 ///////////////////////////////////////////////////////////////////////////////
index 605478d6059419ab3a1420bc6713aab9f2d87c5a..dd2a84ead80be704ccfde780e3a4ad826a40f607 100644 (file)
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, Registerable {
+       /**
+        * "Cached" CSV path, assembled in the constructor from the config
+        * entries 'base_path' and 'csv_file_path'
+        */
+       private $csvFilePath = '';
+
+       /**
+        * Last CSV file instance, set by isCsvFileFound() and reset to NULL by
+        * importCsvFile() once it has been pushed on the stack
+        */
+       private $lastCsvFileInstance = NULL;
+
+       /**
+        * Stack for pushing data from this class to another
+        */
+       private $stackSourceInstance = NULL;
+
+       /**
+        * Stack name for a CSV file
+        */
+       const STACK_NAME_CSV_FILE = 'csv_file';
+
+       /**
+        * Base names of already "imported" CSV files, these are excluded when
+        * the directory is read again by isCsvFileFound()
+        */
+       private $csvFileImported = array();
+
        /**
         * Protected constructor
         *
@@ -30,6 +55,21 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        protected function __construct () {
                // Call parent constructor
                parent::__construct(__CLASS__);
+
+               // "Cache" CSV path (config entries 'base_path' + '/' + 'csv_file_path') for faster usage
+               $this->csvFilePath = $this->getConfigInstance()->getConfigEntry('base_path') . '/' . $this->getConfigInstance()->getConfigEntry('csv_file_path');
+
+               // Create a directory instance (class taken from config entry 'directory_class') for the CSV path
+               $directoryInstance = ObjectFactory::createObjectByConfiguredName('directory_class', array($this->csvFilePath));
+
+               // Set it here, isCsvFileFound() reads directory entries through it
+               $this->setDirectoryInstance($directoryInstance);
+
+               // Create the stack instance (class taken from config entry 'crawler_uploaded_list_url_source_stack_class')
+               $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class');
+
+               // Init the named stack onto which importCsvFile() pushes found CSV files
+               $this->getStackSourceInstance()->initStack(self::STACK_NAME_CSV_FILE);
        }
 
        /**
@@ -44,11 +84,74 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
                // Init source
                $sourceInstance->initSource('crawler', 'uploaded_list');
 
-               // Get a 
                // Return the prepared instance
                return $sourceInstance;
        }
 
+       /**
+        * Checks whether a CSV file is found
+        *
+        * Rewinds the directory iterator when it is no longer valid, then reads
+        * the next directory entry while excluding '.htaccess', '.', '..' and
+        * every file name already recorded in csvFileImported. When the entry is
+        * not empty and ends with '.csv', a CSV file instance for it is created
+        * and stored in lastCsvFileInstance.
+        *
+        * @return      $isFound        Whether a CSV file is found
+        */
+       private function isCsvFileFound () {
+               // Is the directory iterator still valid?
+               if (!$this->getDirectoryInstance()->getDirectoryIteratorInstance()->valid()) {
+                       // No, then rewind it so reading starts over
+                       $this->getDirectoryInstance()->getDirectoryIteratorInstance()->rewind();
+               } // END - if
+
+               // Read next entry, skipping special entries and already imported files
+               $directoryEntry = $this->getDirectoryInstance()->readDirectoryExcept(array_merge(array('.htaccess', '.', '..'), $this->csvFileImported));
+
+               // The read entry must not be empty and must end with '.csv' (compared case-sensitive)
+               if ((empty($directoryEntry)) || (substr($directoryEntry, -4, 4) != '.csv')) {
+                       // Skip further processing
+                       return FALSE;
+               } // END - if
+
+               // Create the CSV file instance (class taken from config entry 'csv_file_class') and remember it
+               $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_file_class', array($this->csvFilePath . '/' . $directoryEntry));
+
+               // Found an entry
+               return TRUE;
+       }
+
+       /**
+        * Initializes the import of the CSV file which is being processed by other task
+        *
+        * Pushes lastCsvFileInstance on the stack STACK_NAME_CSV_FILE, records
+        * the base name of its file in csvFileImported and then resets
+        * lastCsvFileInstance to NULL.
+        *
+        * @return      void
+        * @throws      NullPointerException    If lastCsvFileInstance is not set
+        */
+       private function importCsvFile () {
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: CALLED!');
+
+               // Is the instance set?
+               if (is_null($this->lastCsvFileInstance)) {
+                       // This should not happen, isCsvFileFound() sets it before this method is called
+                       throw new NullPointerException($this, self::EXCEPTION_IS_NULL_POINTER);
+               } // END - if
+
+               // Stack this file
+               $this->getStackSourceInstance()->pushNamed(self::STACK_NAME_CSV_FILE, $this->lastCsvFileInstance);
+
+               // ... and mark it as "imported" by its base name, so isCsvFileFound() excludes it from now on
+               array_push($this->csvFileImported, basename($this->lastCsvFileInstance->getFileName()));
+
+               // ... and finally NULL it (to save some RAM)
+               $this->lastCsvFileInstance = NULL;
+
+               //* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__)->debugOutput('CRAWLER-SOURCE [' . __METHOD__ . ':' . __LINE__ . ']: EXIT!');
+       }
+
+       /**
+        * Getter for stackSourceInstance variable
+        *
+        * The constructor and importCsvFile() call this getter by the name
+        * getStackSourceInstance(), so it has to exist under exactly that name.
+        *
+        * @return      $stackSourceInstance    An instance of an additional stack
+        */
+       public final function getStackSourceInstance () {
+               return $this->stackSourceInstance;
+       }
+
+       /**
+        * Alias of getStackSourceInstance(), kept under the previous method
+        * name so that existing callers continue to work
+        *
+        * @return      $stackSourceInstance    An instance of an additional stack
+        * @deprecated  Use getStackSourceInstance() instead
+        */
+       public final function stackSourceInstance () {
+               return $this->getStackSourceInstance();
+       }
+
        /**
         * Processes entries in the stack.
         *
@@ -58,9 +161,12 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R
        public function processStack () {
                // Does the URL stack have some entries left?
                if (!$this->isUrlStackEmpty()) {
-                       // Nothing to handle here
+                       // Handle next entry from the URL stack
                        $this->processNextEntry();
-               } elseif ($this->
+               } elseif ($this->isCsvFileFound()) {
+                       // A CSV file has been found (and stored in lastCsvFileInstance), so stack it for import.
+                       $this->importCsvFile();
+               }
 
                $this->partialStub('Please implement this method.');
        }
diff --git a/core b/core
index 80d808f788a6b4712bc7a33abcfcc8bb432cbdf9..c8ea0af3f3bfe092c38f0864d689a82172af19c0 160000 (submodule)
--- a/core
+++ b/core
@@ -1 +1 @@
-Subproject commit 80d808f788a6b4712bc7a33abcfcc8bb432cbdf9
+Subproject commit c8ea0af3f3bfe092c38f0864d689a82172af19c0
diff --git a/data/url_lists/demo.lst b/data/url_lists/demo.lst
deleted file mode 100644 (file)
index 84b5b40..0000000
+++ /dev/null
@@ -1 +0,0 @@
-http://mxchange.org