From 2240472ee86800bb630236b05e71584ed7fa790e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Mon, 4 Sep 2023 09:54:14 +0200 Subject: [PATCH] Continued: - functions in module fba.helpers.tidyup are relatively "expensive", means they need a lot of CPU cycles - let's avoid invoking them on empty string --- fba/helpers/tidyup.py | 2 ++ fba/networks/friendica.py | 18 ++++++++++++------ fba/networks/lemmy.py | 27 +++++++++++++++++++-------- fba/networks/mastodon.py | 24 +++++++++++++++++++++--- fba/networks/pleroma.py | 25 ++++++++++++++++--------- 5 files changed, 70 insertions(+), 26 deletions(-) diff --git a/fba/helpers/tidyup.py b/fba/helpers/tidyup.py index 580cb22..68abb8d 100644 --- a/fba/helpers/tidyup.py +++ b/fba/helpers/tidyup.py @@ -36,6 +36,8 @@ def domain(string: str) -> str: if not isinstance(string, str): raise ValueError(f"Parameter string[]='{type(string)}' is not of type 'str'") + elif string == "": + raise ValueError("Parameter string is empty") # All lower-case and strip spaces out + last dot string = string.lower().strip().rstrip(".") diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py index f30f126..317a8ca 100644 --- a/fba/networks/friendica.py +++ b/fba/networks/friendica.py @@ -78,12 +78,18 @@ def fetch_blocks(domain: str) -> list: logger.debug("Found rows()=%d", len(rows)) for line in rows: logger.debug("line='%s'", line) - blocked = tidyup.domain(line.find_all("td")[0].text) + blocked = line.find_all("td")[0].text + logger.debug("blocked='%s'", blocked) + + blocked = tidyup.domain(blocked) if blocked != "" else None reason = tidyup.reason(line.find_all("td")[1].text) - logger.debug("blocked='%s',reason='%s'", blocked, reason) + logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) - if blocked == "": - logger.debug("line[]='%s' returned empty blocked domain - SKIPPED!", type(line)) + if blocked is None: + logger.warning("blocked is empty - SKIPPED!") + continue + elif blocked == "": + 
logger.warning("line[]='%s' returned empty blocked domain - SKIPPED!", type(line)) continue elif not domain_helper.is_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) @@ -92,8 +98,8 @@ def fetch_blocks(domain: str) -> list: logger.debug("Appending blocked='%s',reason='%s'", blocked, reason) blocklist.append({ "blocker" : domain, - "blocked" : tidyup.domain(blocked), - "reason" : tidyup.reason(reason), + "blocked" : blocked, + "reason" : reason, "block_level": "reject", }) diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py index efc85de..e37c2b8 100644 --- a/fba/networks/lemmy.py +++ b/fba/networks/lemmy.py @@ -200,10 +200,13 @@ def fetch_blocks(domain: str) -> list: logger.debug("Found %d blocked instance(s) ...", len(blocking)) for tag in blocking: logger.debug("tag[]='%s'", type(tag)) - blocked = tidyup.domain(tag.contents[0]) + blocked = tidyup.domain(tag.contents[0]) if tag.contents[0] != "" else None logger.debug("blocked='%s'", blocked) - if blocked == "": + if blocked is None: + logger.warning("blocked is empty - SKIPPED!") + continue + elif blocked == "": logger.warning("blocked='%s' is empty after tidyup.domain() - SKIPPED!", tag.contents[0]) continue elif not domain_helper.is_wanted(blocked): @@ -264,11 +267,16 @@ def fetch_instances(domain: str, origin: str) -> list: for tag in rows: logger.debug("tag[]='%s'", type(tag)) text = tag.contents[0] if isinstance(tag.contents[0], str) else tag.contents[0].text - peer = tidyup.domain(text) - logger.debug("peer='%s'", peer) + logger.debug("text='%s' - BEFORE!", text) + + peer = tidyup.domain(text) if text != "" else None + logger.debug("peer='%s' - AFTER", peer) - if peer == "": - logger.debug("peer is empty - SKIPPED!") + if peer is None: + logger.warning("peer is empty - SKIPPED!") + continue + elif peer == "": + logger.warning("peer is an empty string, text='%s' - SKIPPED!", text) continue elif not domain_helper.is_wanted(peer): logger.debug("peer='%s' is not wanted - 
SKIPPED!", peer) @@ -367,8 +375,11 @@ def parse_script(doc: bs4.BeautifulSoup, only: str = None) -> list: peer = tidyup.domain(row["domain"]) logger.debug("peer='%s' - AFTER!", peer) - if peer == "": - logger.debug("peer is empty - SKIPPED!") + if peer is None: + logger.warning("peer is empty - SKIPPED!") + continue + elif peer == "": + logger.warning("peer is an empty string, row[domain]='%s' - SKIPPED!", row["domain"]) continue elif not domain_helper.is_wanted(peer): logger.debug("peer='%s' is not wanted - SKIPPED!", peer) diff --git a/fba/networks/mastodon.py b/fba/networks/mastodon.py index cda3cc4..3b57892 100644 --- a/fba/networks/mastodon.py +++ b/fba/networks/mastodon.py @@ -115,10 +115,27 @@ def fetch_blocks_from_about(domain: str) -> dict: if header_text in blocklist or header_text.lower() in blocklist: # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu for line in header.find_all_next("table")[0].find_all("tr")[1:]: + domain = line.find("span").text + hash = line.find("span")["title"][9:] + reason = line.find_all("td")[1].text + + logger.debug("domain='%s',reason='%s' - BEFORE!", domain, reason) + domain = tidyup.domain(domain) if domain != "" else None + reason = tidyup.reason(reason) if reason != "" else None + + logger.debug("domain='%s',reason='%s' - AFTER!", domain, reason) + if domain is None: + logger.warning("domain is empty,line='%s' - SKIPPED!", line) + continue + elif domain == "": + logger.warning("domain is an empty string,line='%s' - SKIPPED!", line) + continue + + logger.debug("Appending domain='%s',hash='%s',reason='%s' to blocklist header_text='%s' ...", domain, hash, reason, blocklist) blocklist[header_text].append({ - "domain": tidyup.domain(line.find("span").text), - "hash" : tidyup.domain(line.find("span")["title"][9:]), - "reason": tidyup.reason(line.find_all("td")[1].text), + "domain": domain, + "hash" : hash, + "reason": reason, }) else: logger.warning("header_text='%s' 
not found in blocklist()=%d", header_text, len(blocklist)) @@ -157,6 +174,7 @@ def fetch_blocks(domain: str) -> list: logger.debug("block[]='%s' is of type 'dict' - SKIPPED!", type(block)) continue elif "domain" not in block: + logger.debug("block='%s'", block) logger.warning("block()=%d does not contain element 'domain' - SKIPPED!", len(block)) continue elif not domain_helper.is_wanted(block["domain"]): diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py index ebd5b78..1a69c23 100644 --- a/fba/networks/pleroma.py +++ b/fba/networks/pleroma.py @@ -108,7 +108,7 @@ def fetch_blocks(domain: str) -> list: } ).items(): logger.debug("block_level='%s', blocklist()=%d", block_level, len(blocklist)) - block_level = tidyup.domain(block_level) + block_level = tidyup.domain(block_level) if block_level != "" else None logger.debug("block_level='%s' - AFTER!", block_level) if block_level == "": @@ -124,11 +124,14 @@ def fetch_blocks(domain: str) -> list: if len(blocklist) > 0: for blocked in blocklist: logger.debug("blocked='%s' - BEFORE!", blocked) - blocked = tidyup.domain(blocked) + blocked = tidyup.domain(blocked) if blocked != "" else None logger.debug("blocked='%s' - AFTER!", blocked) - if blocked == "": - logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s' - SKIPPED!", domain, block_level) + if blocked is None: + logger.warning("blocked is empty - SKIPPED!") + continue + elif blocked == "": + logger.warning("blocked is an empty string after tidyup.domain(): domain='%s',block_level='%s' - SKIPPED!", domain, block_level) continue logger.debug("Invoking utils.deobfuscate(%s, %s) ...", blocked, domain) @@ -154,7 +157,7 @@ def fetch_blocks(domain: str) -> list: for blocked in data["quarantined_instances"]: logger.debug("blocked='%s' - BEFORE!", blocked) - blocked = tidyup.domain(blocked) + blocked = tidyup.domain(blocked) if blocked != "" else None logger.debug("blocked='%s' - AFTER!", blocked) if blocked == "": @@ -189,7 +192,7 @@ 
def fetch_blocks(domain: str) -> list: } ).items(): logger.debug("block_level='%s', info.items()=%d", block_level, len(info.items())) - block_level = tidyup.domain(block_level) + block_level = tidyup.domain(block_level) if block_level != "" else None logger.debug("block_level='%s' - AFTER!", block_level) if block_level == "": @@ -204,7 +207,7 @@ def fetch_blocks(domain: str) -> list: logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(info.items()), domain, block_level) for blocked, reason in info.items(): logger.debug("blocked='%s',reason[%s]='%s' - BEFORE!", blocked, type(reason), reason) - blocked = tidyup.domain(blocked) + blocked = tidyup.domain(blocked) if blocked != "" else None logger.debug("blocked='%s' - AFTER!", blocked) if isinstance(reason, str): @@ -239,7 +242,7 @@ def fetch_blocks(domain: str) -> list: for blocked in rows: logger.debug("blocked='%s' - BEFORE!", blocked) reason = tidyup.reason(rows[blocked]["reason"]) - blocked = tidyup.domain(blocked) + blocked = tidyup.domain(blocked) if blocked != "" else None logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) if blocked not in rows or "reason" not in rows[blocked]: @@ -365,8 +368,12 @@ def fetch_blocks_from_about(domain: str) -> dict: logger.debug("Found block_level='%s', importing domain blocks ...", block_level) for line in header.find_next("table").find_all("tr")[1:]: logger.debug("line[]='%s'", type(line)) - blocked = tidyup.domain(line.find_all("td")[0].text) + blocked = line.find_all("td")[0].text + logger.debug("blocked='%s'", blocked) + + blocked = tidyup.domain(blocked) if blocked != "" else None reason = tidyup.reason(line.find_all("td")[1].text) + logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) if blocked is None or blocked == "": logger.debug("domain='%s',block_level='%s': blocked is empty - SKIPPED!", domain, block_level) -- 2.39.5