From 1b92010670a38dadfe0d302d3fee7ae32e1006e8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Thu, 17 Aug 2023 00:56:19 +0200 Subject: [PATCH] Continued: - no, nope: validators.hostname() was a bad idea, it also let IP addresses and local host names through - added command remove_invalid to remove those from the database - renamed recheck.sh -> nodeinfo.sh --- fba/boot.py | 7 +++++++ fba/commands.py | 38 ++++++++++++++++++++++++++++++++------ fba/helpers/domain.py | 4 ++-- fba/http/federation.py | 2 +- fba/models/instances.py | 2 +- recheck.sh => nodeinfo.sh | 2 +- 6 files changed, 44 insertions(+), 11 deletions(-) rename recheck.sh => nodeinfo.sh (95%) diff --git a/fba/boot.py b/fba/boot.py index 55e9fb4..6c14c2a 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -231,6 +231,13 @@ def init_parser(): parser.add_argument("--domain", help="Instance name (aka. 'relay')") parser.add_argument("--force", action="store_true", help="Forces update of data, no matter what.") + ### Remove invalid domains ### + parser = subparser_command.add_parser( + "remove_invalid", + help="Removes invalid domains.", + ) + parser.set_defaults(command=commands.remove_invalid) + logger.debug("EXIT!") def run_command(): diff --git a/fba/commands.py b/fba/commands.py index 58e8686..b021cab 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -64,7 +64,7 @@ logger = logging.getLogger(__name__) def check_instance(args: argparse.Namespace) -> int: logger.debug("args.domain='%s' - CALLED!", args.domain) status = 0 - if not validators.hostname(args.domain): + if not validators.domain(args.domain): logger.warning("args.domain='%s' is not valid", args.domain) status = 100 elif blacklist.is_blacklisted(args.domain): @@ -266,7 +266,7 @@ def fetch_blocks(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) if args.domain is not None and args.domain != "": logger.debug("args.domain='%s' - checking ...", args.domain) - if not validators.hostname(args.domain): + 
if not validators.domain(args.domain): logger.warning("args.domain='%s' is not valid.", args.domain) return 100 elif blacklist.is_blacklisted(args.domain): @@ -914,7 +914,7 @@ def fetch_instances(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) logger.debug("args.domain='%s' - checking ...", args.domain) - if not validators.hostname(args.domain): + if not validators.domain(args.domain): logger.warning("args.domain='%s' is not valid.", args.domain) return 100 elif blacklist.is_blacklisted(args.domain): @@ -1076,7 +1076,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int: domain = utils.deobfuscate(domain, block["blocker"]) logger.debug("domain='%s' - AFTER!", domain) - if not validators.hostname(domain): + if not validators.domain(domain): logger.debug("domain='%s' is not a valid domain - SKIPPED!") continue elif blacklist.is_blacklisted(domain): @@ -1382,7 +1382,7 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: if not isinstance(text, str): logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text)) continue - elif validators.hostname(text.strip()): + elif validators.domain(text.strip()): logger.debug("text='%s' is a domain - SKIPPED!", text.strip()) continue @@ -1517,7 +1517,7 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain): database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain]) - elif isinstance(args.software, str) and args.software != "" and validators.hostname(args.software) == args.software: + elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software: database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software]) else: database.cursor.execute("SELECT domain, software, nodeinfo_url 
FROM instances WHERE has_obfuscation = 1") @@ -2023,3 +2023,29 @@ def convert_idna(args: argparse.Namespace) -> int: logger.debug("Success! - EXIT!") return 0 + +def remove_invalid(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC") + rows = database.cursor.fetchall() + + logger.info("Checking %d domains ...", len(rows)) + for row in rows: + logger.debug("row[domain]='%s'", row["domain"]) + if not validators.domain(row["domain"].split("/")[0]): + logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"]) + database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]]) + database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.info("Vaccum cleaning database ...") + database.cursor.execute("VACUUM") + + logger.debug("Success! 
- EXIT!") + return 0 diff --git a/fba/helpers/domain.py b/fba/helpers/domain.py index 6804b3a..96aa189 100644 --- a/fba/helpers/domain.py +++ b/fba/helpers/domain.py @@ -35,7 +35,7 @@ def raise_on(domain: str): raise ValueError("Parameter 'domain' is empty") elif domain.lower() != domain: raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.hostname(domain.split("/")[0]): + elif not validators.domain(domain.split("/")[0]): raise ValueError(f"domain='{domain}' is not a valid domain") elif domain.endswith(".arpa"): raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") @@ -75,7 +75,7 @@ def is_wanted(domain: str) -> bool: raise ValueError("Parameter 'domain' is empty") elif domain.lower() != domain: wanted = False - elif not validators.hostname(domain.split("/")[0]): + elif not validators.domain(domain.split("/")[0]): logger.debug("domain='%s' is not a valid domain name - setting False ...", domain) wanted = False elif domain.endswith(".arpa"): diff --git a/fba/http/federation.py b/fba/http/federation.py index f4c9529..aa1adf9 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -478,7 +478,7 @@ def find_domains(tag: bs4.element.Tag) -> list: "reason": reason, }) continue - elif not validators.hostname(domain.split("/")[0]): + elif not validators.domain(domain.split("/")[0]): logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain) continue diff --git a/fba/models/instances.py b/fba/models/instances.py index 5a801b0..1bd2884 100644 --- a/fba/models/instances.py +++ b/fba/models/instances.py @@ -174,7 +174,7 @@ def add(domain: str, origin: str, command: str, path: str = None, software: str raise ValueError(f"software[]='{type(software)}' is not of type 'str'") elif software == "": raise ValueError("Parameter 'software' is empty") - elif origin is not None and not validators.hostname(origin.split("/")[0]): + elif origin is not None and not 
validators.domain(origin.split("/")[0]): raise ValueError(f"Bad origin name='{origin}'") elif blacklist.is_blacklisted(domain): raise Exception(f"domain='{domain}' is blacklisted, but function invoked") diff --git a/recheck.sh b/nodeinfo.sh similarity index 95% rename from recheck.sh rename to nodeinfo.sh index 1c7fdaa..53888d0 100755 --- a/recheck.sh +++ b/nodeinfo.sh @@ -3,7 +3,7 @@ MODE="" if [ "$1" = "--help" ] then - echo "Usage: $ [file|--software|--software2|--nodeinfo|--generator|--detection|--no-auto|--no-auto2|--timeout]" + echo "Usage: $0 [file|--software|--software2|--nodeinfo|--generator|--detection|--no-auto|--no-auto2|--timeout]" exit 255 elif [ -n "$1" -a -f "$1" ] then -- 2.39.2