From 131299a2d55bdfbd989c633e99eeab3399256d28 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Tue, 11 Jul 2023 00:51:49 +0200 Subject: [PATCH 1/1] Continued: - added command convert_idna to convert UTF-8 encoded international domain names to punycode domains (IDNA), it caused some to be added in both encodings --- fba/boot.py | 7 +++ fba/commands.py | 113 +++++++++++++++++++++++++++++++++++----- fba/helpers/domain.py | 2 + fba/http/federation.py | 7 ++- fba/models/blocks.py | 29 +++++++++++ fba/models/instances.py | 43 +++++++++++++-- fba/utils.py | 13 ++++- 7 files changed, 197 insertions(+), 17 deletions(-) diff --git a/fba/boot.py b/fba/boot.py index ebde9f2..15f29a1 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -198,6 +198,13 @@ def init_parser(): ) parser.set_defaults(command=commands.fetch_instances_social) + ### Convert international domain names to punycode domains ### + parser = subparser_command.add_parser( + "convert_idna", + help="Convertes UTF-8 encoded international domain names into punycode (IDNA) domain names.", + ) + parser.set_defaults(command=commands.convert_idna) + logger.debug("EXIT!") def run_command(): diff --git a/fba/commands.py b/fba/commands.py index f80c89a..bd3d393 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -146,7 +146,12 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: elif row["domain"] == "": logger.debug("row[domain] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(row["domain"]): + + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + row["domain"] = row["domain"].encode("idna").decode("utf-8") + logger.debug("row[domain]='%s' - AFTER!", row["domain"]) + + if not utils.is_domain_wanted(row["domain"]): logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) continue elif instances.is_registered(row["domain"]): @@ -238,6 +243,10 @@ def fetch_bkali(args: argparse.Namespace) -> int: if len(domains) > 0: logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + try: logger.info("Fetching instances from domain='%s' ...", domain) federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name) @@ -354,6 +363,8 @@ def fetch_blocks(args: argparse.Namespace) -> int: elif block["blocked"].endswith(".tld"): logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"]) continue + elif "xn--" in block["blocked"]: + raise ValueError(f"blocked='{block['blocked']}' is a punycode domain, please don't crawl them!") elif block["blocked"].find("*") >= 0: logger.debug("blocker='%s' uses obfuscated domains", blocker) @@ -389,7 +400,12 @@ def fetch_blocks(args: argparse.Namespace) -> int: if block["blocked"] == "": logger.debug("block[blocked] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(block["blocked"]): + + logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"]) + block["blocked"] = block["blocked"].encode("idna").decode("utf-8") + logger.debug("block[blocked]='%s' - AFTER!", block["blocked"]) + + if not utils.is_domain_wanted(block["blocked"]): logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif block["block_level"] in ["accept", "accepted"]: @@ -501,12 +517,17 @@ def fetch_observer(args: argparse.Namespace) -> int: for item in items: logger.debug("item[]='%s'", type(item)) domain = item.decode_contents() - logger.debug("domain='%s' - AFTER!", domain) + if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): @@ -673,7 +694,10 @@ def fetch_cs(args: argparse.Namespace): for row in blocklist[block_level]: logger.debug("row[%s]='%s'", type(row), row) - if instances.is_recent(row["domain"], "last_blocked"): + if not "domain" in row: + logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row)) + continue + elif instances.is_recent(row["domain"], "last_blocked"): logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"]) continue elif not instances.is_registered(row["domain"]): @@ -741,7 +765,12 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: @@ -761,6 +790,7 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: if len(domains) > 0: logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s'", domain) try: logger.info("Fetching instances from domain='%s' ...", domain) federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) @@ -813,7 +843,12 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: @@ -885,7 +920,12 @@ def fetch_instances(args: argparse.Namespace) -> int: if row["domain"] == "": logger.debug("row[domain] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(row["domain"]): + + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + row["domain"] = row["domain"].encode("idna").decode("utf-8") + logger.debug("row[domain]='%s' - AFTER!", row["domain"]) + + if not utils.is_domain_wanted(row["domain"]): logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) continue @@ -1035,6 +1075,8 @@ def fetch_oliphant(args: argparse.Namespace) -> int: elif domain.endswith(".tld"): logger.debug("domain='%s' is a fake domain - SKIPPED", domain) continue + elif "xn--" in domain: + raise ValueError(f"domain='{domain}' is a punycode domain, please translate them back!") elif domain.find("*") >= 0 or domain.find("?") >= 0: logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"]) domain = utils.deobfuscate(domain, block["blocker"]) @@ -1171,7 +1213,12 @@ def fetch_fedipact(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): @@ -1420,6 +1467,8 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: elif block["blocked"].endswith(".tld"): logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"]) continue + elif "xn--" in block["blocked"]: + raise ValueError(f"blocked='{block['blocked']}' is a punycode domain, please translate them back!") elif block["blocked"].endswith(".onion"): logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"]) continue @@ -1516,7 +1565,12 @@ def fetch_fedilist(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"]) continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif (args.all is None or not args.all) and instances.is_registered(domain): @@ -1625,12 +1679,17 @@ def fetch_instances_social(args: argparse.Namespace) -> int: for row in rows: logger.debug("row[]='%s'", type(row)) domain = tidyup.domain(row["name"]) - logger.debug("domain='%s' - AFTER!", domain) + if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: @@ -1648,3 +1707,33 @@ def fetch_instances_social(args: argparse.Namespace) -> int: logger.debug("Success! - EXIT!") return 0 + +def convert_idna(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "domain") + + database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "origin") + + database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocker") + + database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocked") + + logger.debug("Success! - EXIT!") + return 0 diff --git a/fba/helpers/domain.py b/fba/helpers/domain.py index 5328e3f..75f97a5 100644 --- a/fba/helpers/domain.py +++ b/fba/helpers/domain.py @@ -39,6 +39,8 @@ def raise_on(domain: str): raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!") elif domain.endswith(".tld"): raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif "xn--" in domain: + raise ValueError(f"domain='{domain}' is a punycode domain, please don't crawl them!") logger.debug("EXIT!") diff --git a/fba/http/federation.py b/fba/http/federation.py index 8cc5905..1418755 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -111,7 +111,12 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path: if instance == "": logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain) continue - elif not utils.is_domain_wanted(instance): + + logger.debug("instance='%s' - BEFORE!", instance) + instance = instance.encode("idna").decode("utf-8") + logger.debug("instance='%s' - AFTER!", instance) + + if not utils.is_domain_wanted(instance): logger.debug("instance='%s' is not wanted - SKIPPED!", instance) continue elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0): diff --git a/fba/models/blocks.py b/fba/models/blocks.py index 6b8edef..cad0f6c 100644 --- a/fba/models/blocks.py +++ b/fba/models/blocks.py @@ -194,3 +194,32 @@ def valid(value: str, column: str) -> bool: logger.debug("valid='%s' - EXIT!", valid) return valid + +def translate_idnas(rows: list, column: str): + logger.debug("rows[]='%s' - CALLED!", type(rows)) + if not isinstance(rows, list): + raise ValueError(f"rows[]='%s' is not of type 'list'", type(rows)) + elif len(rows) == 0: + raise ValueError("Parameter 'rows' is an empty list") + elif not isinstance(column, str): + raise ValueError(f"column='%s' is not of type 'str'", type(column)) + elif column == "": + raise ValueError("Parameter 'column' is empty") + elif column not in ["blocker", "blocked"]: + raise ValueError(f"column='{column}' is not supported") + + logger.info("Checking/converting %d domain names ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + + translated = row[column].encode("idna").decode("utf-8") + logger.debug("translated='%s',row[%s]='%s'", translated, column, row[column]) + + if translated != row[column]: + logger.info("Translated row[%s]='%s' to '%s'", column, row[column], translated) + database.cursor.execute(f"UPDATE blocks SET {column} = ? WHERE {column} = ?", [translated, row[column]]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("EXIT!") diff --git a/fba/models/instances.py b/fba/models/instances.py index 7968ff2..1b84504 100644 --- a/fba/models/instances.py +++ b/fba/models/instances.py @@ -270,9 +270,13 @@ def set_success(domain: str): logger.debug("EXIT!") -def is_registered(domain: str) -> bool: - logger.debug("domain='%s' - CALLED!", domain) - domain_helper.raise_on(domain) +def is_registered(domain: str, skip_raise = False) -> bool: + logger.debug("domain='%s',skip_raise='%s' - CALLED!", domain, skip_raise) + if not isinstance(skip_raise, bool): + raise ValueError(f"skip_raise[]='%s' is not type of 'bool'", type(skip_raise)) + + if not skip_raise: + domain_helper.raise_on(domain) logger.debug("domain='%s' - CALLED!", domain) if not cache.key_exists("is_registered"): @@ -474,3 +478,36 @@ def valid(value: str, column: str) -> bool: logger.debug("valid='%s' - EXIT!", valid) return valid + +def translate_idnas(rows: list, column: str): + logger.debug("rows[]='%s' - CALLED!", type(rows)) + if not isinstance(rows, list): + raise ValueError(f"rows[]='%s' is not of type 'list'", type(rows)) + elif len(rows) == 0: + raise ValueError("Parameter 'rows' is an empty list") + elif not isinstance(column, str): + raise ValueError(f"column='%s' is not of type 'str'", type(column)) + elif column == "": + raise ValueError("Parameter 'column' is empty") + elif column not in ["domain", "origin"]: + raise ValueError(f"column='{column}' is not supported") + + logger.info("Checking/converting %d domain names ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + + translated = row[column].encode("idna").decode("utf-8") + logger.debug("translated='%s',row[%s]='%s'", translated, column, row[column]) + + if translated != row[column]: + logger.info("Translated row[%s]='%s' to '%s'", column, row[column], translated) + if is_registered(translated, True): + logger.warning("Deleting row[%s]='%s' as translated='%s' already exist", column, row[column], translated) + database.cursor.execute(f"DELETE FROM instances WHERE {column} = ? LIMIT 1", [row[column]]) + else: + database.cursor.execute(f"UPDATE instances SET {column} = ? WHERE {column} = ? LIMIT 1", [translated, row[column]]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("EXIT!") diff --git a/fba/utils.py b/fba/utils.py index 727af2b..019938c 100644 --- a/fba/utils.py +++ b/fba/utils.py @@ -127,11 +127,20 @@ def find_domains(tags: bs4.element.ResultSet, search: str) -> list: for tag in tags: logger.debug("tag[]='%s'", type(tag)) domain = tidyup.domain(tag.find(search).contents[0]) + logger.debug("domain='%s' - AFTER!", domain) - logger.debug("domain='%s'", domain) if domain == "": logger.debug("tag='%s' has no domain, trying ...", tag) domain = tidyup.domain(tag.find("em").contents[0]) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.warning("Empty domain after checking search='%s' and tags - SKIPPED!", search) + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) if not is_domain_wanted(domain): logger.debug("domain='%s' is not wanted - SKIPPED!", domain) @@ -165,6 +174,8 @@ def is_domain_wanted(domain: str) -> bool: elif domain.endswith(".tld"): logger.debug("domain='%s' is a fake domain - settings False ...", domain) wanted = False + elif "xn--" in domain: + raise ValueError(f"domain='{domain}' is a punycode domain, please don't crawl them!") elif blacklist.is_blacklisted(domain): logger.debug("domain='%s' is blacklisted - settings False ...", domain) wanted = False -- 2.39.5