From 84d35049e4fb877c935f0fcd9e57309dba501bb1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Wed, 12 Jul 2023 07:28:43 +0200 Subject: [PATCH] Continued: - a recursive (aka. "crawl") depth of 500 is REALLY far deep, practically the whole Fediverse - minimum peer count to deepen the "crawl" to max depth is 100 peers - flush any pending data of current domain before continuing --- fba/http/federation.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/fba/http/federation.py b/fba/http/federation.py index e3b40bf..0d8cdf8 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -39,11 +39,15 @@ from fba.networks import lemmy from fba.networks import misskey from fba.networks import peertube +_DEPTH = 0 + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None): - logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path) + global _DEPTH + logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH) + _DEPTH = _DEPTH + 1 domain_helper.raise_on(domain) if not isinstance(origin, str) and origin is not None: @@ -79,7 +83,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin) peerlist = fetch_peers(domain, software, origin) except network.exceptions as exception: - logger.warning("Cannot fetch peers from domain='%s': '%s'", domain, type(exception)) + logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception)) logger.debug("peerlist[]='%s'", type(peerlist)) if isinstance(peerlist, list): @@ -88,7 +92,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: logger.debug("peerlist[]='%s'", type(peerlist)) if peerlist is None or len(peerlist) == 0: - logger.warning("Cannot fetch peers: domain='%s'", domain) + logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software) if instances.has_pending(domain): logger.debug("Flushing updates for domain='%s' ...", domain) @@ -97,6 +101,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: logger.debug("Invoking cookies.clear(%s) ...", domain) cookies.clear(domain) + _DEPTH = _DEPTH - 1 logger.debug("EXIT!") return @@ -129,8 +134,18 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance) continue elif not instances.is_registered(instance): - logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s' ...", instance, domain, command, path) - fetch_instances(instance, domain, None, command, path) + logger.debug("Checking if domain='%s' has pending updates ...", domain) + if instances.has_pending(domain): + logger.debug("Flushing updates for domain='%s' ...", domain) + instances.update_data(domain) + + logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH) + if _DEPTH <= 500 and len(peerlist) >= 100: + logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH) + fetch_instances(instance, domain, None, command, path) + else: + logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH) + instances.add(instance, domain, command) logger.debug("Invoking cookies.clear(%s) ...", domain) cookies.clear(domain) @@ -140,6 +155,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: logger.debug("Flushing updates for domain='%s' ...", domain) instances.update_data(domain) + _DEPTH = _DEPTH - 1 logger.debug("EXIT!") def fetch_peers(domain: str, software: str, origin: str) -> list: -- 2.39.5