From b5871ac612bf87961f03b0609b4de5a8cb8f489e Mon Sep 17 00:00:00 2001
From: =?utf8?q?Roland=20H=C3=A4der?=
Date: Wed, 6 Sep 2023 03:37:48 +0200
Subject: [PATCH] Continued: - fetch_relays now supports --software=foo parameter - added support for 'pub-relay' relays, they provide their peers over their nodeinfo URL (see element metadata -> peers)

---
Review note (commentary, not part of the commit message): in the new
'pub-relay' branch the warnings guarding raw["json"]["metadata"] and
raw["json"]["metadata"]["peers"] both reported a missing key 'json'; they now
name the key that is actually checked ('metadata' and 'peers'). Indentation in
this patch was reconstructed from the Python block structure.

 fba/boot.py              |  1 +
 fba/commands.py          | 82 ++++++++++++++++++++++++++++++++--------
 fba/helpers/blacklist.py |  1 +
 3 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/fba/boot.py b/fba/boot.py
index d9fa4af..8e0f3fe 100644
--- a/fba/boot.py
+++ b/fba/boot.py
@@ -231,6 +231,7 @@ def init_parser():
     )
     parser.set_defaults(command=commands.fetch_relays)
     parser.add_argument("--domain", help="Instance name (aka. 'relay')")
+    parser.add_argument("--software", help="Name of software, e.g. 'lemmy'")
     parser.add_argument("--force", action="store_true", help="Forces update of data, no matter what.")
 
     ### Remove invalid domains ###
diff --git a/fba/commands.py b/fba/commands.py
index 7611f55..99a2870 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -1867,9 +1867,11 @@ def fetch_relays(args: argparse.Namespace) -> int:
     locking.acquire()
 
     if args.domain is not None and args.domain != "":
-        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
+    elif args.software is not None and args.software != "":
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
     else:
-        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
 
     domains = list()
     rows = database.cursor.fetchall()
@@ -1883,13 +1885,44 @@ def fetch_relays(args: argparse.Namespace) -> int:
             continue
 
         try:
-            logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
-            raw = utils.fetch_url(
-                f"https://{row['domain']}",
-                network.web_headers,
-                (config.get("connection_timeout"), config.get("read_timeout"))
-            ).text
-            logger.debug("raw[%s]()=%d", type(raw), len(raw))
+            if row["software"] == "pub-relay":
+                logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
+                raw = network.fetch_api_url(
+                    row["nodeinfo_url"],
+                    (config.get("connection_timeout"), config.get("read_timeout"))
+                )
+
+                logger.debug("raw[%s]()=%d", type(raw), len(raw))
+                if "exception" in raw:
+                    logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
+                    raise raw["exception"]
+                elif "error_message" in raw:
+                    logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
+                    instances.set_last_error(row["domain"], raw)
+                    instances.set_last_instance_fetch(row["domain"])
+                    instances.update(row["domain"])
+                    continue
+                elif not "json" in raw:
+                    logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
+                    continue
+                elif not "metadata" in raw["json"]:
+                    logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
+                    continue
+                elif not "peers" in raw["json"]["metadata"]:
+                    logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
+                    continue
+            else:
+                logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+                raw = utils.fetch_url(
+                    f"https://{row['domain']}",
+                    network.web_headers,
+                    (config.get("connection_timeout"), config.get("read_timeout"))
+                ).text
+                logger.debug("raw[%s]()=%d", type(raw), len(raw))
+
+                doc = bs4.BeautifulSoup(raw, features="html.parser")
+                logger.debug("doc[]='%s'", type(doc))
+
         except network.exceptions as exception:
             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
             instances.set_last_error(row["domain"], exception)
@@ -1897,9 +1930,6 @@ def fetch_relays(args: argparse.Namespace) -> int:
             instances.update(row["domain"])
             continue
 
-        doc = bs4.BeautifulSoup(raw, features="html.parser")
-        logger.debug("doc[]='%s'", type(doc))
-
         logger.debug("row[software]='%s'", row["software"])
         if row["software"] == "activityrelay":
             logger.debug("Checking row[domain]='%s' ...", row["domain"])
@@ -1967,10 +1997,6 @@ def fetch_relays(args: argparse.Namespace) -> int:
                 components = urlparse(link["href"])
                 domain = components.netloc.lower()
 
-                if not domain_helper.is_wanted(domain):
-                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
-                    continue
-
                 logger.debug("domain='%s' - BEFORE!", domain)
                 domain = tidyup.domain(domain)
                 logger.debug("domain='%s' - AFTER!", domain)
@@ -1985,6 +2011,29 @@ def fetch_relays(args: argparse.Namespace) -> int:
                 if dict_helper.has_key(domains, "domain", domain):
                     logger.debug("domain='%s' already added", domain)
                     continue
+                elif not domain_helper.is_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    continue
+
+                logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
+                domains.append({
+                    "domain": domain,
+                    "origin": row["domain"],
+                })
+        elif row["software"] == "pub-relay":
+            logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
+            for domain in raw["json"]["metadata"]["peers"]:
+                logger.debug("domain='%s'", domain)
+                if domain not in peers:
+                    logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
+                    peers.append(domain)
+
+                if dict_helper.has_key(domains, "domain", domain):
+                    logger.debug("domain='%s' already added", domain)
+                    continue
+                elif not domain_helper.is_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    continue
 
                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
                 domains.append({
@@ -1993,6 +2042,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
                 })
         else:
             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
+            continue
 
         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
         instances.set_last_instance_fetch(row["domain"])
diff --git a/fba/helpers/blacklist.py b/fba/helpers/blacklist.py
index bbd8d03..91c674b 100644
--- a/fba/helpers/blacklist.py
+++ b/fba/helpers/blacklist.py
@@ -47,6 +47,7 @@ _blacklist = {
     "misskeytest.chn.moe" : "Testing/developing instances shouldn't be part of public instances",
     "netlify.app" : "Testing/developing instances shouldn't be part of public instances",
     "ignorelist.com" : "Testing/developing instances shouldn't be part of public instances",
+    "app.github.dev" : "Testing/developing instances shouldn't be part of public instances",
     "hexbear.net" : "Is a Lemmy instance with malicious JavaScript code (shell commands)",
 }
 
-- 
2.39.5