From: Roland Häder Date: Sat, 10 May 2025 15:21:55 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=HEAD;p=fba.git Continued: - added handling of multi-domain entries, e.g. split by comma or slash - tidyup.domain() removes last asterisk --- diff --git a/fba/commands.py b/fba/commands.py index 31d216d..1581ddb 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -660,7 +660,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int: doc = bs4.BeautifulSoup(raw, "html.parser") logger.debug("doc[]='%s'", type(doc)) - silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li") + silenced = doc.find("h3", {"id": "limited_servers"}).find_next("ul").findAll("li") logger.info("Checking %d silenced/limited entries ...", len(silenced)) blocklist["silenced"] = utils.find_domains(silenced, "div") diff --git a/fba/helpers/tidyup.py b/fba/helpers/tidyup.py index 5fe8694..2f80183 100644 --- a/fba/helpers/tidyup.py +++ b/fba/helpers/tidyup.py @@ -80,5 +80,10 @@ def domain(string: str) -> str: if string.endswith("silence"): string = string.split("silence")[0] + # Some people have TLDs with this word on the end + logger.debug("string='%s' - #8", string) + if string.endswith("*"): + string = string.split("*")[-1] + logger.debug("string='%s' - EXIT!", string) return string diff --git a/fba/http/federation.py b/fba/http/federation.py index 087f91a..6be9822 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -510,6 +510,7 @@ def find_domains(tag: bs4.element.Tag, domain_column: str = "dt", reason_column: domains = [] for element in tag.find_all(domain_column): logger.debug("element[%s]='%s'", type(element), element) + domain = tidyup.domain(element.text) reasons = element.find_next(reason_column).text.split(reason_text)[1].splitlines() logger.debug("domain='%s',reasons(%d)='%s'", domain, len(reasons), reasons) diff --git a/fba/utils.py b/fba/utils.py index 37024c1..6ba7f4a 100644 --- 
a/fba/utils.py +++ b/fba/utils.py @@ -57,18 +57,39 @@ def find_domains(tags: bs4.element.ResultSet, search: str) -> list: logger.debug("Parsing %d tags ...", len(tags)) for tag in tags: logger.debug("tag[]='%s'", type(tag)) - domain = tidyup.domain(tag.find(search).contents[0]) - logger.debug("domain='%s' - AFTER!", domain) + domain = tag.find(search).contents[0] + logger.debug("domain='%s' - BEFORE! #1", domain) + if domain not in ["", None]: + domain = tidyup.domain(domain) + logger.debug("domain='%s' - AFTER! #2", domain) if domain == "": logger.debug("tag='%s' has no domain, trying ...", tag) - domain = tidyup.domain(tag.find("em").contents[0]) - logger.debug("domain='%s' - AFTER!", domain) + domain = tag.find("em").contents[0] + logger.debug("domain='%s' - BEFORE! #2", domain) + if domain not in ["", None]: + domain = tidyup.domain(domain) + logger.debug("domain='%s' - AFTER! #2", domain) logger.debug("domain='%s' - AFTER2!", domain) if domain == "": logger.warning("Empty domain after checking search='%s' and tags - SKIPPED!", search) continue + elif domain == "noagendasocial.com/noagenda.social": + logger.debug("domain='%s' is a double-domain entry, adding all ...", domain) + add_all_to_list(domains, domain, "/") + + logger.debug("domain='%s' - SKIPPING!", domain) + continue + elif "," in domain: + logger.debug("domain='%s' contains a comma-separated list of domains, adding all ...", domain) + add_all_to_list(domains, domain, ",") + + logger.debug("domain='%s' - SKIPPING!", domain) + continue + elif not validators.domain(domain, rfc_2782=True): + logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain) + continue logger.debug("domain='%s' - BEFORE!", domain) domain = domain_helper.encode_idna(domain) @@ -78,12 +99,37 @@ def find_domains(tags: bs4.element.ResultSet, search: str) -> list: logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue - logger.debug("Appending domain='%s'", domain) + logger.debug("Appending domain='%s' ...", 
domain) domains.append(domain) logger.debug("domains()=%d - EXIT!", len(domains)) return domains +def add_all_to_list(domains: list, source: str, splitter: str) -> None: + logger.debug("domains()=%d,source='%s',splitter='%s' - CALLED!") + if not isinstance(domains, list): + raise TypeError(f"Parameter domains[]='{type(domains)}' is not type 'list'") + elif not isinstance(source, str): + raise TypeError(f"Parameter source[]='{type(source)}' is not type 'list'") + elif source == "": + raise ValueError("Parameter 'source' is empty") + elif not isinstance(splitter, str): + raise TypeError(f"Parameter splitter[]='{type(splitter)}' is not type 'list'") + elif splitter == "": + raise ValueError("Parameter 'splitter' is empty") + + for domain in source.split(splitter): + logger.debug("domain='%s' - LOOP!", domain) + domain = domain.strip() + if not domain_helper.is_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + continue + + logger.debug("Appending domain='%s' ...", domain) + domains.append(domain) + + logger.debug("EXIT!") + def deobfuscate(domain: str, blocker: str, domain_hash: str = None) -> str: logger.debug("domain='%s',blocker='%s',domain_hash='%s' - CALLED!", domain, blocker, domain_hash) domain_helper.raise_on(blocker)