From 2d6883fad6ba05ee90224274ec57979f47a568d9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Sat, 1 Jul 2023 04:34:06 +0200 Subject: [PATCH] Continued: - skip empty domains - rewrote handling of blocklist from pleroma --- fba/commands.py | 102 +++++++++++++++++++++++++++----------- fba/networks/friendica.py | 3 +- fba/networks/pleroma.py | 97 ++++++++++++++++++------------------ 3 files changed, 122 insertions(+), 80 deletions(-) diff --git a/fba/commands.py b/fba/commands.py index 55be64e..bd755b7 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -109,8 +109,11 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: if "domain" not in row: logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row) continue + elif row["domain"] == "": + logger.debug("row[domain] is empty - SKIPPED!") + continue elif not utils.is_domain_wanted(row["domain"]): - logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) + logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) continue elif instances.is_registered(row["domain"]): logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"]) @@ -160,8 +163,11 @@ def fetch_bkali(args: argparse.Namespace) -> int: if "domain" not in entry: logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry)) continue + elif entry["domain"] == "": + logger.debug("entry[domain] is empty - SKIPPED!") + continue elif not utils.is_domain_wanted(entry["domain"]): - logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"]) + logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"]) continue elif instances.is_registered(entry["domain"]): logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"]) @@ -327,8 +333,11 @@ def fetch_blocks(args: argparse.Namespace) -> int: nodeinfo_url = row["nodeinfo_url"] logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"]) - if not utils.is_domain_wanted(block["blocked"]): - logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + if block["blocked"] == "": + logger.debug("block[blocked] is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(block["blocked"]): + logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif block["block_level"] in ["accept", "accepted"]: logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"]) @@ -429,9 +438,12 @@ def fetch_observer(args: argparse.Namespace) -> int: logger.debug("item[]='%s'", type(item)) domain = item.decode_contents() - logger.debug("domain='%s'", domain) - if not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) @@ -604,10 +616,14 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: logger.debug("rss[]='%s'", type(rss)) for item in rss.items: logger.debug("item='%s'", item) - domain = item.link.split("=")[1] + domain = tidyup.domain(item.link.split("=")[1]) - if not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: logger.debug("domain='%s' is already added - SKIPPED!", domain) @@ -659,13 +675,17 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: doc = bs4.BeautifulSoup(entry.content.value, "html.parser") logger.debug("doc[]='%s'", type(doc)) for element in doc.findAll("a"): + logger.debug("element[]='%s'", type(element)) for href in element["href"].split(","): - logger.debug("href[%s]='%s", type(href), href) + logger.debug("href[%s]='%s' - BEFORE!", type(href), href) domain = tidyup.domain(href) - logger.debug("domain='%s'", domain) - if not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: logger.debug("domain='%s' is already added - SKIPPED!", domain) @@ -723,9 +743,12 @@ def fetch_instances(args: argparse.Namespace) -> int: rows = database.cursor.fetchall() logger.info("Checking %d entries ...", len(rows)) for row in rows: - logger.debug("domain='%s'", row["domain"]) - if not utils.is_domain_wanted(row["domain"]): - logger.debug("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) + logger.debug("row[domain]='%s'", row["domain"]) + if row["domain"] == "": + logger.debug("row[domain] is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(row["domain"]): + logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) continue try: @@ -846,8 +869,11 @@ def fetch_oliphant(args: argparse.Namespace) -> int: reject_reports = True logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports) - if not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue logger.debug("Marking domain='%s' as handled", domain) @@ -902,12 +928,15 @@ def fetch_txt(args: argparse.Namespace) -> int: logger.info("Processing %d domains ...", len(domains)) for domain in domains: - logger.debug("domain='%s'", domain) + logger.debug("domain='%s' - BEFORE!", domain) + domain = tidyup.domain(domain) + + logger.debug("domain='%s' - AFTER!", domain) if domain == "": logger.debug("domain is empty - SKIPPED!") continue elif not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_recent(domain): logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) @@ -943,12 +972,12 @@ def fetch_fedipact(args: argparse.Namespace) -> int: logger.debug("row[]='%s'", type(row)) domain = tidyup.domain(row.contents[0]) - logger.debug("domain='%s'", domain) + logger.debug("domain='%s' - AFTER!", domain) if domain == "": logger.debug("domain is empty - SKIPPED!") continue elif not utils.is_domain_wanted(domain): - logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) @@ -1064,10 +1093,15 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("blocking()=%d", blocking) for block in blocking: + logger.debug("block[]='%s'", type(block)) block["blocked"] = tidyup.domain(block["blocked"]) - if not utils.is_domain_wanted(block["blocked"]): - logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + logger.debug("block[blocked]='%s' - AFTER!", block["blocked"]) + if block["blocked"] == "": + logger.debug("block[blocked] is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(block["blocked"]): + logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif instances.is_recent(block["blocked"]): logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"]) @@ -1082,10 +1116,15 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("blocker[%s]='%s'", type(blocker), blocker) for block in blocking: + logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block["reason"]) block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None - if not utils.is_domain_wanted(block["blocked"]): - logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"]) + if block["blocked"] == "": + logger.debug("block[blocked] is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(block["blocked"]): + logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"]) @@ -1151,10 +1190,13 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: obfuscated = 0 blockdict = list() for block in blocking: - logger.debug("blocked='%s'", block["blocked"]) + logger.debug("block[blocked]='%s'", block["blocked"]) blocked = None - if block["blocked"].endswith(".arpa"): + if block["blocked"] == "": + logger.debug("block[blocked] is empty - SKIPPED!") + continue + elif block["blocked"].endswith(".arpa"): logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"]) continue elif block["blocked"].endswith(".tld"): @@ -1168,7 +1210,7 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: obfuscated = obfuscated + 1 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None) elif not utils.is_domain_wanted(block["blocked"]): - logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif blocks.is_instance_blocked(row["domain"], block["blocked"]): logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"]) diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py index 459f948..ab985c7 100644 --- a/fba/networks/friendica.py +++ b/fba/networks/friendica.py @@ -85,9 +85,10 @@ def fetch_blocks(domain: str) -> list: logger.debug("blocked='%s' is not wanted - SKIPPED!", domain) continue + logger.debug("blocked='%s',domain='%s' - BEFORE!", blocked, domain) blocked = utils.deobfuscate_domain(blocked, domain) - logger.debug("blocked[%s]='%s'", type(blocked), blocked) + logger.debug("blocked[%s]='%s' - DEOBFUSCATED!", type(blocked), blocked) if not utils.is_domain_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py index a701fdc..435d7ce 100644 --- a/fba/networks/pleroma.py +++ b/fba/networks/pleroma.py @@ -36,7 +36,10 @@ logger = logging.getLogger(__name__) # Language mapping X -> English language_mapping = { # English -> English - "Reject": "Suspended servers", + "limited servers" : "followers_only", + "suspended servers": "reject", + "silenced servers" : "silenced", + "filtered media" : "filtered_media", } def fetch_blocks(domain: str, nodeinfo_url: str) -> list: @@ -232,16 +235,13 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list: for blocked in rows: logger.debug("blocked='%s' - BEFORE!", blocked) blocked = tidyup.domain(blocked) - logger.debug("blocked='%s' - AFTER!", blocked) + reason = tidyup.reason(rows[blocked]["reason"]) + logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) if blocked not in rows or "reason" not in rows[blocked]: logger.warning("Cannot find blocked='%s' in rows()=%d,domain='%s' - BREAK!", blocked, len(rows), domain) break - - reason = tidyup.reason(rows[blocked]["reason"]) - logger.debug("reason='%s'", reason) - - if blocked == "": + elif blocked == "": logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level) continue elif not utils.is_domain_wanted(blocked): @@ -271,35 +271,26 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list: logger.debug("blocklist()=%d", len(blocklist)) if len(blocklist) > 0: - logger.info("Checking %d record(s) ...", len(blocklist)) + logger.info("Checking %d different blocklists ...", len(blocklist)) for block_level in blocklist: logger.debug("block_level='%s'", block_level) rows = blocklist[block_level] logger.debug("rows[%s]()=%d'", type(rows), len(rows)) - for record in rows: - logger.debug("record[]='%s'", type(record)) - blocked = tidyup.domain(record["blocked"]) - reason = tidyup.reason(record["reason"]) - logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) - - if not utils.is_domain_wanted(blocked): - logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked) - continue - - logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain) - blocked = utils.deobfuscate_domain(blocked, domain) + for block in rows: + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", block["blocked"], domain) + block["blocked"] = utils.deobfuscate_domain(block["blocked"], domain) - logger.debug("blocked='%s' - DEOBFUSCATED!", blocked) - if not utils.is_domain_wanted(blocked): - logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) + logger.debug("block[blocked]='%s' - DEOBFUSCATED!", block["blocked"]) + if not utils.is_domain_wanted(block["blocked"]): + logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"]) continue - logger.debug("Appending blocker='%s',blocked='%s',reason='%s',block_level='%s' ...",domain, blocked, reason, block_level) + logger.debug("Appending blocker='%s',block[blocked]='%s',block[reason]='%s',block_level='%s' ...",domain, block["blocked"], block["reason"], block_level) blockdict.append({ "blocker" : domain, - "blocked" : blocked, - "reason" : reason, + "blocked" : block["blocked"], + "reason" : block["reason"], "block_level": block_level, }) @@ -347,10 +338,10 @@ def fetch_blocks_from_about(domain: str) -> dict: break blocklist = { - "Suspended servers": [], - "Filtered media" : [], - "Limited servers" : [], - "Silenced servers" : [], + "reject" : [], + "filtered_media": [], + "followers_only": [], + "silenced" : [], } logger.debug("doc[]='%s'", type(doc)) @@ -358,33 +349,41 @@ def fetch_blocks_from_about(domain: str) -> dict: logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain) return list() - for header in doc.find_all("h2"): + headers = doc.find_all("h2") + + logger.debug("headers[]='%s'", type(headers)) + if headers is None: + logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain) + return list() + + logger.info("Checking %d headers ...", len(headers)) + for header in headers: logger.debug("header[%s]='%s'", type(header), header) - header_text = tidyup.reason(header.text) + block_level = tidyup.reason(header.text).lower() - logger.debug("header_text='%s' - BEFORE!", header_text) - if header_text in language_mapping: - logger.debug("header_text='%s' - FOUND!", header_text) - header_text = language_mapping[header_text] + logger.debug("block_level='%s' - BEFORE!", block_level) + if block_level in language_mapping: + logger.debug("block_level='%s' - FOUND!", block_level) + block_level = language_mapping[block_level].lower() else: - logger.warning("header_text='%s' not found in language mapping table", header_text) + logger.warning("block_level='%s' not found in language mapping table", block_level) - logger.debug("header_text='%s - AFTER!'", header_text) - if header_text in blocklist or header_text.lower() in blocklist: + logger.debug("block_level='%s - AFTER!'", block_level) + if block_level in blocklist: # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu - logger.debug("Found header_text='%s', importing domain blocks ...", header_text) + logger.debug("Found block_level='%s', importing domain blocks ...", block_level) for line in header.find_next("table").find_all("tr")[1:]: logger.debug("line[]='%s'", type(line)) - blocklist[header_text].append({ - "blocked": tidyup.domain(line.find_all("td")[0].text), - "reason" : tidyup.reason(line.find_all("td")[1].text), + blocked = tidyup.domain(line.find_all("td")[0].text) + reason = tidyup.reason(line.find_all("td")[1].text) + + logger.debug("Appending block_level='%s',blocked='%s',reason='%s' ...", block_level, blocked, reason) + blocklist[block_level].append({ + "blocked": blocked, + "reason" : reason, }) else: - logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist)) + logger.warning("block_level='%s' not found in blocklist()=%d", block_level, len(blocklist)) logger.debug("Returning blocklist for domain='%s' - EXIT!", domain) - return { - "reject" : blocklist["Suspended servers"], - "media_removal" : blocklist["Filtered media"], - "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"], - } + return blocklist -- 2.39.5