From 1947b43a33ec7b21042e1d4eb53929cbb31078c7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Sun, 2 Jul 2023 09:09:30 +0200 Subject: [PATCH] Continued: - added command fetch_fedilist() - please don't overuse these commands: fetching instances is limited to those not recently fetched, but fetching from static websites like fediverse.observer is NOT limited! - flush pending data here, too --- fba/boot.py | 8 ++++++++ fba/commands.py | 44 ++++++++++++++++++++++++++++++++++++++++++ fba/http/federation.py | 1 + 3 files changed, 53 insertions(+) diff --git a/fba/boot.py b/fba/boot.py index dc53f92..07b4a60 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -172,6 +172,14 @@ def init_parser(): ) parser.set_defaults(command=commands.check_nodeinfo) + ### Fetch CSV from fedilist.com ### + parser = subparser_command.add_parser( + "fetch_fedilist", + help="Fetches CSV from fedilist.com", + ) + parser.set_defaults(command=commands.fetch_fedilist) + parser.add_argument("--software", help="Name of software, e.g. 'lemmy'") + logger.debug("EXIT!") def run_command(): diff --git a/fba/commands.py b/fba/commands.py index fc5de72..7e36311 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -1279,3 +1279,47 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: logger.debug("Success! 
- EXIT!") return 0 + +def fetch_fedilist(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + url = "http://demo.fedilist.com/instance/csv?onion=not" + if args.software is not None and args.software != "": + logger.debug("args.software='%s'", args.software) + url = f"http://demo.fedilist.com/instance/csv?software={args.software}&onion=not" + + locking.acquire() + + logger.info("Fetching url='%s' from fedilist.com ...", url) + response = reqto.get( + url, + headers=network.web_headers, + timeout=(config.get("connection_timeout"), config.get("read_timeout")), + allow_redirects=False + ) + + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) + reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect="unix") + + logger.debug("reader[]='%s'", type(reader)) + blockdict = list() + for row in reader: + logger.debug("row[]='%s'", type(row)) + domain = tidyup.domain(row["hostname"]) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"]) + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue + + logger.info("Fetching instances from domain='%s' ...", domain) + federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! 
- EXIT!") + return 0 diff --git a/fba/http/federation.py b/fba/http/federation.py index 4e38815..0939722 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -63,6 +63,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: elif not isinstance(software, str): raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'") + logger.debug("Checking if domain='%s' is registered ...", domain) if not instances.is_registered(domain): logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software) instances.add(domain, origin, command, path, software) -- 2.39.5