From 10c3c3d416d4d5e39d2277ed2d1b97ec5f39e90f Mon Sep 17 00:00:00 2001
From: =?utf8?q?Roland=20H=C3=A4der?=
Date: Mon, 24 Jul 2023 16:35:53 +0200
Subject: [PATCH] Continued:

- added command fetch_relays() for fetching instances from ActivityPub relays
  that list their peers on their index page (/)
- added grid.tf to the blacklist, as it flooded the federation tables with a
  lot of "testing/developing" sub-domains

---
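Example invocation (a sketch: this assumes fba.py is the CLI entry point and
that the sub-command name registered via add_parser() matches the handler
fetch_relays() - the add_parser() call itself lies above this hunk's context;
relay.example.com is a placeholder):

    ./fba.py fetch_relays --domain=relay.example.com --force

Without --domain, all stored relay instances (activityrelay, aoderelay,
selective-relay) are checked; --force also re-checks relays whose data is
still considered recent.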
 fba/boot.py              |   2 +
 fba/commands.py          | 148 +++++++++++++++++++++++++++++++++++++--
 fba/helpers/blacklist.py |   1 +
 3 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/fba/boot.py b/fba/boot.py
index 7bd816a..abc94ac 100644
--- a/fba/boot.py
+++ b/fba/boot.py
@@ -219,6 +219,8 @@ def init_parser():
         help="Fetches instances from ActivityPub relays",
     )
     parser.set_defaults(command=commands.fetch_relays)
+    parser.add_argument("--domain", help="Instance name (aka. 'relay')")
+    parser.add_argument("--force", action="store_true", help="Forces update of data, no matter what.")
 
     logger.debug("EXIT!")
 
diff --git a/fba/commands.py b/fba/commands.py
index d94966f..7b9849d 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -36,6 +36,7 @@ from fba import utils
 from fba.helpers import blacklist
 from fba.helpers import config
 from fba.helpers import cookies
+from fba.helpers import dicts as dict_helper
 from fba.helpers import locking
 from fba.helpers import processing
 from fba.helpers import software as software_helper
@@ -121,7 +122,7 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int:
         return list()
 
     try:
-        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
+        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
         fetched = network.get_json_api(
             source_domain,
             "/api/v1/servers/all.json?scope=All&country=all&language=all",
@@ -569,7 +570,12 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
         "reject": list(),
     }
 
-    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
+    raw = utils.fetch_url(
+        f"https://{source_domain}/todon/domainblocks",
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    ).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(raw, "html.parser")
@@ -672,7 +678,12 @@ def fetch_cs(args: argparse.Namespace):
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
 
-    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
+    raw = utils.fetch_url(
+        f"https://{source_domain}/chaossocial/meta/master/federation.md",
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    ).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
@@ -1727,6 +1738,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int:
         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
     }
 
+    logger.info("Fetching list from source_domain='%s' ...", source_domain)
     fetched = network.get_json_api(
         source_domain,
         "/api/1.0/instances/list?count=0&sort_by=name",
@@ -1787,14 +1799,138 @@ def fetch_instances_social(args: argparse.Namespace) -> int:
 def fetch_relays(args: argparse.Namespace) -> int:
     logger.debug("args[]='%s' - CALLED!", type(args))
 
-    database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
+    if args.domain is not None and args.domain != "":
+        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
+    else:
+        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
 
     domains = list()
-
     rows = database.cursor.fetchall()
 
+    logger.info("Checking %d relays ...", len(rows))
     for row in rows:
-        logger.debug("Fetching peers from row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+        logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+        if not args.force and instances.is_recent(row["domain"]):
+            logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
+            continue
+
+        try:
+            logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+            raw = utils.fetch_url(
+                f"https://{row['domain']}",
+                network.web_headers,
+                (config.get("connection_timeout"), config.get("read_timeout"))
+            ).text
+            logger.debug("raw[%s]()=%d", type(raw), len(raw))
+        except network.exceptions as exception:
+            logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
+            instances.set_last_error(row["domain"], exception)
+            instances.set_last_instance_fetch(row["domain"])
+            instances.update_data(row["domain"])
+            continue
+
+        doc = bs4.BeautifulSoup(raw, features="html.parser")
+        logger.debug("doc[]='%s'", type(doc))
+
+        logger.debug("row[software]='%s'", row["software"])
+        if row["software"] == "activityrelay":
+            logger.debug("Checking row[domain]='%s' ...", row["domain"])
+            tags = doc.findAll("p")
+
+            logger.debug("Checking %d paragraphs ...", len(tags))
+            for tag in tags:
+                logger.debug("tag[]='%s'", type(tag))
+                if len(tag.contents) == 0:
+                    logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
+                    continue
+                elif "registered instances" not in tag.contents[0]:
+                    logger.debug("Skipping paragraph, text not found.")
+                    continue
+
+                logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
+                for domain in tag.contents:
+                    logger.debug("domain[%s]='%s'", type(domain), domain)
+                    if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
+                        continue
+
+                    domain = str(domain)
+                    if not utils.is_domain_wanted(domain):
+                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                        continue
+
+                    logger.debug("domain='%s' - BEFORE!", domain)
+                    domain = tidyup.domain(domain)
+                    logger.debug("domain='%s' - AFTER!", domain)
+
+                    if domain == "":
+                        logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
+                        continue
+                    elif instances.is_registered(domain):
+                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
+                        continue
+                    elif dict_helper.has_key(domains, "domain", domain):
+                        logger.debug("domain='%s' already added", domain)
+                        continue
+
+                    logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
+                    domains.append({
+                        "domain": domain,
+                        "origin": row["domain"],
+                    })
+        elif row["software"] in ["aoderelay", "selective-relay"]:
+            logger.debug("Checking row[domain]='%s' ...", row["domain"])
+            if row["software"] == "aoderelay":
+                tags = doc.findAll("section", {"class": "instance"})
+            else:
+                tags = doc.find("div", {"id": "instances"}).findAll("li")
+
+            logger.debug("Checking %d tags ...", len(tags))
+            for tag in tags:
+                logger.debug("tag[]='%s'", type(tag))
+
+                link = tag.find("a")
+                logger.debug("link[%s]='%s'", type(link), link)
+                if link is None:
+                    logger.warning("tag='%s' has no a-tag ...", tag)
+                    continue
+
+                components = urlparse(link["href"])
+                domain = components.netloc.lower()
+
+                if not utils.is_domain_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    continue
+
+                logger.debug("domain='%s' - BEFORE!", domain)
+                domain = tidyup.domain(domain)
+                logger.debug("domain='%s' - AFTER!", domain)
+
+                if domain == "":
+                    logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
+                    continue
+                elif instances.is_registered(domain):
+                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
+                    continue
+                elif dict_helper.has_key(domains, "domain", domain):
+                    logger.debug("domain='%s' already added", domain)
+                    continue
+
+                logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
+                domains.append({
+                    "domain": domain,
+                    "origin": row["domain"],
+                })
+        else:
+            logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
+
+        logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
+        instances.set_last_instance_fetch(row["domain"])
+        instances.update_data(row["domain"])
+
+    logger.info("Found %d domains to add ...", len(domains))
+    for row in domains:
+        logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
+        federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
 
     logger.debug("Success! - EXIT!")
     return 0
diff --git a/fba/helpers/blacklist.py b/fba/helpers/blacklist.py
index d2426b0..891a337 100644
--- a/fba/helpers/blacklist.py
+++ b/fba/helpers/blacklist.py
@@ -32,6 +32,7 @@ _blacklist = {
     "lhr.life"       : "Floods federation tables with fake nodes",
     "localhost.run"  : "Floods federation tables with fake nodes",
     "loca.lt"        : "Floods federation tables with fake nodes",
+    "grid.tf"        : "Floods federation tables with fake nodes",
     "ngrok.io"       : "Testing/developing instances shouldn't be part of public instances",
     "ngrok.app"      : "Testing/developing instances shouldn't be part of public instances",
     "ngrok-free.app" : "Testing/developing instances shouldn't be part of public instances",
-- 
2.39.5