From: Roland Häder Date: Sun, 25 Jun 2023 06:47:13 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=7e986a58c24e38a622dbe18f1ea7f8c210778d5f;p=fba.git Continued: - let's be nice and only fetch instances/blocks from records that have not been crawled recently (i.e. outdated ones), so we keep bandwidth usage low on these servers --- diff --git a/fba/commands.py b/fba/commands.py index f19d51b..f443520 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -108,17 +108,20 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: for row in rows: logger.debug("row[]='%s'", type(row)) if "domain" not in row: - logger.warning("row='%s' does not contain element 'domain' - SKIPPED!") + logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row) continue - elif not utils.is_domain_wanted(row['domain']): - logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row['domain']) + elif not utils.is_domain_wanted(row["domain"]): + logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) continue - elif instances.is_registered(row['domain']): - logger.debug("row[domain]='%s' is already registered - SKIPPED!", row['domain']) + elif instances.is_registered(row["domain"]): + logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"]) + continue + elif instances.is_recent(row["domain"]): + logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"]) continue - logger.debug("Fetching instances from row[domain]='%s' ...", row['domain']) - federation.fetch_instances(row['domain'], None, None, inspect.currentframe().f_code.co_name) + logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"]) + federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception)) @@ -162,10 +165,13 @@ def fetch_bkali(args: 
argparse.Namespace) -> int: logger.debug("entry[domain]='%s' is not wanted - SKIPPED!") continue elif instances.is_registered(entry["domain"]): - logger.debug("domain='%s' is already registered - SKIPPED!", entry['domain']) + logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"]) + continue + elif instances.is_recent(entry["domain"]): + logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"]) continue - logger.debug("Adding domain='%s' ...", entry['domain']) + logger.debug("Adding domain='%s' ...", entry["domain"]) domains.append(entry["domain"]) except network.exceptions as exception: @@ -551,7 +557,10 @@ def fetch_cs(args: argparse.Namespace): for row in domains[block_level]: logger.debug("row[%s]='%s'", type(row), row) - if not instances.is_registered(row["domain"]): + if instances.is_recent(row["domain"], "last_blocked"): + logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"]) + continue + elif not instances.is_registered(row["domain"]): try: logger.info("Fetching instances from domain='%s' ...", row["domain"]) federation.fetch_instances(row["domain"], 'chaos.social', None, inspect.currentframe().f_code.co_name) @@ -586,8 +595,8 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: logger.debug("item='%s'", item) domain = item.link.split("=")[1] - if blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) + if not utils.is_domain_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: logger.debug("domain='%s' is already added - SKIPPED!", domain) @@ -595,6 +604,9 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue logger.debug("Adding 
domain='%s'", domain) domains.append(domain) @@ -649,11 +661,14 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue logger.debug("Adding domain='%s',domains()=%d", domain, len(domains)) domains.append(domain) - logger.debug("domains(%d)='%s", len(domains), domains) + logger.debug("domains()='%d", len(domains)) if len(domains) > 0: locking.acquire() @@ -696,7 +711,7 @@ def fetch_instances(args: argparse.Namespace) -> int: for row in rows: logger.debug("domain='%s'", row[0]) if not utils.is_domain_wanted(row[0]): - logger.debug("Domain is not wanted: row[0]='%s'", row[0]) + logger.debug("Domain row[0]='%s' is not wanted - SKIPPED!", row[0]) continue try: @@ -768,6 +783,9 @@ def fetch_oliphant(args: argparse.Namespace) -> int: elif args.domain in domains: logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain) continue + elif instances.is_recent(block["blocker"]): + logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"]) + continue # Fetch this URL logger.info("Fetching csv_url='%s' for blocker='%s' ...", block['csv_url'], block["blocker"]) @@ -794,6 +812,9 @@ def fetch_oliphant(args: argparse.Namespace) -> int: if not utils.is_domain_wanted(domain): logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue logger.debug("Marking domain='%s' as handled", domain) domains.append(domain) @@ -835,6 +856,9 @@ def fetch_txt(args: argparse.Namespace) -> int: elif not utils.is_domain_wanted(domain): logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently 
crawled - SKIPPED!", domain) + continue logger.debug("domain='%s',row[blocker]='%s'", domain, row["blocker"]) processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name) @@ -876,6 +900,9 @@ def fetch_fedipact(args: argparse.Namespace) -> int: elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue logger.info("Fetching domain='%s' ...", domain) federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)