From 1ab18fbbb5a265027f734fecc87c52cdffb8ef2d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Mon, 7 Dec 2020 07:46:54 +0100 Subject: [PATCH] Continued: - moved $stackSourceInstance to BaseUrlSource - also introduced BaseUrlSource->initStacks() - renamed UrlSource::CRAWL_JOB_ARRAY_* to URL_SOURCE_ARRAY_* - added more debug lines MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Roland Häder --- .../classes/source/class_BaseUrlSource.php | 50 +++++++++++-- .../class_CrawlerUploadedListUrlSource.php | 73 +++++++------------ .../source/urls/class_UrlSource.php | 6 +- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/application/hub/classes/source/class_BaseUrlSource.php b/application/hub/classes/source/class_BaseUrlSource.php index cae9a475a..d51e6ea76 100644 --- a/application/hub/classes/source/class_BaseUrlSource.php +++ b/application/hub/classes/source/class_BaseUrlSource.php @@ -6,6 +6,7 @@ namespace Org\Shipsimu\Hub\Crawler\Source\Url; use Org\Shipsimu\Hub\Crawler\Source\BaseSource; // Import framework stuff +use Org\Mxchange\CoreFramework\Factory\ObjectFactory; use Org\Mxchange\CoreFramework\Factory\Stack\FileStackFactory; use Org\Mxchange\CoreFramework\Traits\Stack\StackableTrait; @@ -41,6 +42,11 @@ abstract class BaseUrlSource extends BaseSource { // Stack name for all URLs const STACKER_NAME_URLS = 'urls'; + /** + * Stack for pushing data from this clas to another + */ + private $stackSourceInstance = NULL; + /** * Protected constructor * @@ -52,6 +58,38 @@ abstract class BaseUrlSource extends BaseSource { parent::__construct($className); } + /** + * Initializes given stack instances + * + * @param $stacks Stacks to initialize + * @param $stackType Type of the stack/URL source + * @return void + */ + protected function initStacks (array $stacks, string $stackType) { + // Init stack instance + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('BASE-URL-SOURCE: stacks()=%d,stackType=%s - CALLED!', count($stacks), $stackType)); + $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName(sprintf('%s_url_source_stack_class', $stackType)); + + // Init stacks + foreach($stacks as $stackName) { + // Init single stack + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('BASE-URL-SOURCE: stackName=%s', $stackName)); + $this->getStackSourceInstance()->initStack($stackName); + } + + // Trace message + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('BASE-URL-SOURCE: EXIT!'); + } + + /** + * Getter for stackSourceInstance variable + * + * @return $stackSourceInstance An instance of an additional stack + */ + public final function getStackSourceInstance () { + return $this->stackSourceInstance; + } + /** * Initalizes this source * @@ -114,15 +152,15 @@ abstract class BaseUrlSource extends BaseSource { if (count($crawlData) == 0) { // Throw IAE throw new InvalidArgumentException('Parameter "crawlData" has no elements'); - } elseif (!isset($crawlData[UrlSource::CRAWL_JOB_ARRAY_START_URL])) { + } elseif (!isset($crawlData[UrlSource::URL_SOURCE_ARRAY_START_URL])) { // Throw IAE - throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::CRAWL_JOB_ARRAY_START_URL)); - } elseif (!isset($crawlData[UrlSource::CRAWL_JOB_ARRAY_START_DEPTH])) { + throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::URL_SOURCE_ARRAY_START_URL)); + } elseif (!isset($crawlData[UrlSource::URL_SOURCE_ARRAY_START_DEPTH])) { // Throw IAE - throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::CRAWL_JOB_ARRAY_START_DEPTH)); - } elseif (!isset($crawlData[UrlSource::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH])) { + throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::URL_SOURCE_ARRAY_START_DEPTH)); + } elseif (!isset($crawlData[UrlSource::URL_SOURCE_ARRAY_EXTERNAL_DEPTH])) { // Throw IAE - throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH)); + throw new InvalidArgumentException(sprintf('crawlData()=%d does not contain element "%s"', count($crawlData), UrlSource::URL_SOURCE_ARRAY_EXTERNAL_DEPTH)); } // @TODO Add more elements diff --git a/application/hub/classes/source/urls/class_CrawlerUploadedListUrlSource.php b/application/hub/classes/source/urls/class_CrawlerUploadedListUrlSource.php index a3c48303c..aa6f52a11 100644 --- a/application/hub/classes/source/urls/class_CrawlerUploadedListUrlSource.php +++ b/application/hub/classes/source/urls/class_CrawlerUploadedListUrlSource.php @@ -68,11 +68,6 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R */ private $lastCsvFileInstance = NULL; - /** - * Stack for pushing data from this clas to another - */ - private $stackSourceInstance = NULL; - /** * "Imported" CSV files */ @@ -108,19 +103,6 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R // Set it here $this->setDirectoryInstance($directoryInstance); - // Init stack instance - $this->stackSourceInstance = ObjectFactory::createObjectByConfiguredName('crawler_uploaded_list_url_source_stack_class'); - - // Init stacks - foreach([ - self::STACK_NAME_CSV_FILE, - self::STACK_NAME_CSV_ENTRY, - ] as $stackName) { - // Init single stack - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('CRAWLER-UPLOADED-LIST-URL-SOURCE: stackName=%s', $stackName)); - $this->getStackSourceInstance()->initStack($stackName); - } - // "Cache" column separator $this->columnSeparator = FrameworkBootstrap::getConfigurationInstance()->getConfigEntry('crawler_url_list_column_separator'); @@ -129,12 +111,22 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R } /** - * Getter for stackSourceInstance variable + * Creates an instance of this class * - * @return $stackSourceInstance An instance of an additional stack + * @return $sourceInstance An instance of an UrlSource class */ - public final function getStackSourceInstance () { - return $this->stackSourceInstance; + public final static function createCrawlerUploadedListUrlSource () { + // Get new instance + $sourceInstance = new CrawlerUploadedListUrlSource(); + + // Init source + $sourceInstance->initSource('crawler', 'uploaded_list'); + + // Init stacks + $sourceInstance->initStacks([self::STACK_NAME_CSV_FILE, self::STACK_NAME_CSV_ENTRY], 'crawler_uploaded_list'); + + // Return the prepared instance + return $sourceInstance; } /** @@ -186,29 +178,13 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R $infoInstance = new SplFileInfo($this->csvFilePath . '/' . $directoryEntry); // Initialize CSV file instance - $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', array($infoInstance)); + $this->lastCsvFileInstance = ObjectFactory::createObjectByConfiguredName('csv_input_file_class', [$infoInstance]); // Found an entry - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: directoryEntry(' . strlen($directoryEntry) . ')=' . $directoryEntry . ' - Instance created - EXIT!'); + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('CRAWLER-UPLOADED-LIST-URL-SOURCE: directoryEntry(%d)=%s - Instance created - EXIT!', strlen($directoryEntry), $directoryEntry)); return TRUE; } - /** - * Creates an instance of this class - * - * @return $sourceInstance An instance of an UrlSource class - */ - public final static function createCrawlerUploadedListUrlSource () { - // Get new instance - $sourceInstance = new CrawlerUploadedListUrlSource(); - - // Init source - $sourceInstance->initSource('crawler', 'uploaded_list'); - - // Return the prepared instance - return $sourceInstance; - } - /** * Enriches and saves the given CSV entry (array) in the assigned * file-based stack. To such entry a lot more informations are added, such @@ -218,10 +194,8 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * @return void */ private function saveCsvDataInCrawlerQueue (array $csvData) { - // Debug message - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvData()=' . count($csvData) . ' - CALLED!'); - // The array must have a fixed amount of elements, later enhancements may accept more + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvData()=' . count($csvData) . ' - CALLED!'); assert(count($csvData) == self::CRAWL_ENTRY_SIZE); /* @@ -230,20 +204,20 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * column to the CSV file. */ $csvArray = [ - UrlSource::CRAWL_JOB_ARRAY_START_URL => $csvData[0], - UrlSource::CRAWL_JOB_ARRAY_START_DEPTH => $csvData[1], - UrlSource::CRAWL_JOB_ARRAY_EXTERNAL_DEPTH => $csvData[2] + UrlSource::URL_SOURCE_ARRAY_START_URL => $csvData[0], + UrlSource::URL_SOURCE_ARRAY_START_DEPTH => $csvData[1], + UrlSource::URL_SOURCE_ARRAY_EXTERNAL_DEPTH => $csvData[2] ]; // Then add more data to it - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvArray()=' . count($csvArray) . ' - BEFORE!'); + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvArray()=%d - BEFORE!', count($csvArray))); $this->enrichCrawlerQueueData($csvArray); /* * Then enqueue it in the file stack. The local crawler "task" will * then pick this up. */ - /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvArray()=' . count($csvArray) . ' - AFTER!'); + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput(sprintf('CRAWLER-UPLOADED-LIST-URL-SOURCE: csvArray()=%d - AFTER!', count($csvArray))); $this->enqueueInFileStack($csvArray); // Trace message @@ -392,12 +366,14 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * A CSV file has been found and "imported" (added to stack). Now * the file can be read line by line and checked every one of it. */ + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: Calling this->parseCsvEntry() ...'); $this->parseCsvEntry(); } elseif ($this->isCsvFileAdded()) { /* * A CSV file has been found and "imported" (added to stack). Now * the file can be read line by line and checked every one of it. */ + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: Calling this->parseCsvFile() ...'); $this->parseCsvFile(); } elseif ($this->isCsvFileFound() && !$this->isLastCsvFileImported()) { /* @@ -405,6 +381,7 @@ class CrawlerUploadedListUrlSource extends BaseUrlSource implements UrlSource, R * CSV-like as you may wish to provide meta data such as crawl * depth, handling of 3rd-party URLs and such. */ + /* NOISY-DEBUG: */ self::createDebugInstance(__CLASS__, __LINE__)->debugOutput('CRAWLER-UPLOADED-LIST-URL-SOURCE: Calling this->addCsvFile() ...'); $this->addCsvFile(); } diff --git a/application/hub/interfaces/source/urls/class_UrlSource.php b/application/hub/interfaces/source/urls/class_UrlSource.php index 1fb816e4b..fefe8e3a0 100644 --- a/application/hub/interfaces/source/urls/class_UrlSource.php +++ b/application/hub/interfaces/source/urls/class_UrlSource.php @@ -29,9 +29,9 @@ use Org\Shipsimu\Hub\Crawler\Source\Source; */ interface UrlSource extends Source { // Array elements for CSV data array - const CRAWL_JOB_ARRAY_START_URL = 'start_url'; - const CRAWL_JOB_ARRAY_START_DEPTH = 'start_depth'; - const CRAWL_JOB_ARRAY_EXTERNAL_DEPTH = 'external_depth'; + const URL_SOURCE_ARRAY_START_URL = 'start_url'; + const URL_SOURCE_ARRAY_START_DEPTH = 'start_depth'; + const URL_SOURCE_ARRAY_EXTERNAL_DEPTH = 'external_depth'; /** * Fills the URL stack with new entries from source -- 2.39.5