From 1cc837f4f6ee8d8e518020fc87a81cb5a5ac3c9d Mon Sep 17 00:00:00 2001
From: =?utf8?q?Roland=20H=C3=A4der?=
Date: Wed, 29 Jan 2025 16:35:58 +0100
Subject: [PATCH] Continued:

- used wrapping network.fetch_json_rows() instead of raw network.fetch_url()
---
 fba/commands.py | 64 ++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/fba/commands.py b/fba/commands.py
index eaac2f4..e4bbeae 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -105,9 +105,10 @@ ORDER BY domain ASC")
     for row in rows:
         logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
         punycode = domain_helper.encode_idna(row["domain"])
+        logger.debug("punycode='%s' - AFTER!", punycode)
 
         if row["nodeinfo_url"].startswith("/"):
-            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
+            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches - SKIP!", row["nodeinfo_url"])
             continue
         elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
             logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
@@ -1271,22 +1272,15 @@ def fetch_joinmobilizon(args: argparse.Namespace) -> int:
     sources.update(source_domain)
 
     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
-    raw = network.fetch_url(
-        f"https://{source_domain}/api/v1/instances",
-        headers=network.web_headers,
-        timeout=config.timeout
-    ).text
-    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
-
-    parsed = json.loads(raw)
-    logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
-
-    if "data" not in parsed:
-        logger.warning("parsed()=%d does not contain key 'data'")
-        return 1
+    rows = network.fetch_json_rows(
+        source_domain,
+        "/api/v1/instances",
+        network.web_headers,
+        "data"
+    )
 
-    logger.info("Checking %d instances ...", len(parsed["data"]))
-    for row in parsed["data"]:
+    logger.info("Checking %d instances ...", len(rows))
+    for row in rows:
         logger.debug("row[]='%s'", type(row))
         if "host" not in row:
             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
@@ -1321,23 +1315,16 @@ def fetch_joinmisskey(args: argparse.Namespace) -> int:
     logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
     sources.update(source_domain)
 
-    logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
-    raw = network.fetch_url(
-        f"https://{source_domain}/instances.json",
-        headers=network.web_headers,
-        timeout=config.timeout
-    ).text
-    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
-
-    parsed = json.loads(raw)
-    logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
-
-    if "instancesInfos" not in parsed:
-        logger.warning("parsed()=%d does not contain element 'instancesInfos'")
-        return 1
+    logger.info("Fetching /instances.json from source_domain='%s' ...", source_domain)
+    rows = network.fetch_json_rows(
+        source_domain,
+        "/instances.json",
+        network.web_headers,
+        "instancesInfos"
+    )
 
-    logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
-    for row in parsed["instancesInfos"]:
+    logger.info("Checking %d instance(s) ...", len(rows))
+    for row in rows:
         logger.debug("row[%s]='%s'", type(row), row)
         if "url" not in row:
             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
@@ -1578,7 +1565,7 @@ def fetch_fedilist(args: argparse.Namespace) -> int:
             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
             continue
         elif (args.force_all is None or not args.force_all) and instances.is_registered(domain):
-            logger.debug("domain='%s' is already registered, --force-all not specified: args.force_all[]='%s'", domain, type(args.force_all))
+            logger.debug("domain='%s' is already registered, --force-all not specified: args.force_all[]='%s' - SKIPPED!", domain, type(args.force_all))
             continue
         elif instances.is_recent(domain):
             logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
@@ -1755,11 +1742,12 @@ def fetch_relaylist(args: argparse.Namespace) -> int:
 
     logger.info("Checking %d row(s) ...", len(rows))
     for row in rows:
-        logger.debug("row[%s]='%s' - BEFORE!", type(row), row)
+        logger.debug("row[%s]='%s'", type(row), row)
         if "url" not in row:
             logger.warning("row='%s' has no required element 'url' - SKIPPED!", row)
             continue
 
+        logger.debug("row[url]='%s' - BEFORE!", row["url"])
         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
         logger.debug("domain='%s' - AFTER!", domain)
 
@@ -1840,10 +1828,10 @@ def fetch_relays(args: argparse.Namespace) -> int:
                logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
                continue
            elif "metadata" not in raw["json"]:
-               logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
+               logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
                continue
            elif "peers" not in raw["json"]["metadata"]:
-               logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
+               logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
                continue
            else:
                logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
@@ -1876,7 +1864,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
                    logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
                    continue
                elif "registered instances" not in tag.contents[0]:
-                   logger.debug("Skipping paragraph, text not found.")
+                   logger.debug("Skipping paragraph, text not found - SKIPPED!")
                    continue
 
                logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
@@ -1904,7 +1892,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
 
            logger.debug("domains()=%d,domain='%s'", len(domains), domain)
            if dict_helper.has_key(domains, "domain", domain):
-               logger.debug("domain='%s' already added", domain)
+               logger.debug("domain='%s' already added - SKIPPED!", domain)
                continue
 
            logger.debug("Appending domain='%s',row[domain]='%s',row[software]='%s' ...", domain, row["domain"], row["software"])
-- 
2.39.5
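
Note on the wrapper: network.fetch_json_rows() itself is not included in this patch, so its behaviour can only be inferred from the two call sites in fetch_joinmobilizon() and fetch_joinmisskey(). It apparently takes a source domain, a request path, request headers and the name of the JSON key holding the rows, and folds the former fetch_url()/json.loads()/key-check boilerplate into one call. A rough, self-contained sketch of such a wrapper is given below; the requests-based fetch, the fixed 10-second timeout and the empty-list fallback on a missing key are assumptions standing in for the project's own network.fetch_url(), config.timeout and error handling, not the real implementation.

# Hypothetical sketch of a fetch_json_rows()-style wrapper, inferred from the
# call sites in fetch_joinmobilizon() and fetch_joinmisskey(). The real helper
# lives in fba's network module and is not shown in this patch.
import logging
from typing import Any, Dict, List

import requests

logger = logging.getLogger(__name__)

def fetch_json_rows(domain: str, path: str, headers: Dict[str, str], key: str) -> List[Any]:
    # Fetch https://{domain}{path}, parse the response as JSON and return the
    # list stored under `key`. A missing key is logged and yields an empty
    # list here, matching the "warn and carry on" style of the callers above.
    response = requests.get(f"https://{domain}{path}", headers=headers, timeout=10)
    response.raise_for_status()

    parsed = response.json()
    logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))

    if key not in parsed:
        logger.warning("parsed()=%d does not contain key '%s'", len(parsed), key)
        return []

    return parsed[key]

With a wrapper along these lines, each command keeps only its per-row validation, e.g. rows = fetch_json_rows(source_domain, "/api/v1/instances", web_headers, "data") as in the fetch_joinmobilizon() hunk.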