From ff159101effff821de45a779fe454df359f3239b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Sun, 15 Sep 2024 17:29:42 +0200 Subject: [PATCH] Continued: - rewrote chaos.social parser for their own documentation at meta.chaos.social --- fba/commands.py | 12 ++++++------ fba/http/federation.py | 44 +++++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/fba/commands.py b/fba/commands.py index 6726b02..4abf8b7 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -747,7 +747,7 @@ def fetch_cs(args: argparse.Namespace): "rejected": list(), } - source_domain = "raw.githubusercontent.com" + source_domain = "meta.chaos.social" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) return 1 @@ -757,7 +757,7 @@ def fetch_cs(args: argparse.Namespace): logger.info("Fetching federation.md from source_domain='%s' ...", source_domain) raw = network.fetch_url( - f"https://{source_domain}/chaossocial/meta/master/federation.md", + f"https://{source_domain}/federation", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")) ).text @@ -766,13 +766,13 @@ def fetch_cs(args: argparse.Namespace): doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser") logger.debug("doc()=%d[]='%s'", len(doc), type(doc)) - silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody") + silenced = doc.find("h2", {"id": "silenced-instances"}).find_next("dl", attrs={"class": "instance-list"}) logger.debug("silenced[%s]()=%d", type(silenced), len(silenced)) blocklist["silenced"] = federation.find_domains(silenced) - blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody") - logger.debug("blocked[%s]()=%d", type(blocked), len(blocked)) - blocklist["rejected"] = federation.find_domains(blocked) + defederated = doc.find("h2", {"id": "defederated-instances"}).find_next("dl", attrs={"class": "instance-list"}) + logger.debug("defederated[%s]()=%d", type(defederated), len(defederated)) + blocklist["rejected"] = federation.find_domains(defederated) blocking = blocklist["silenced"] + blocklist["rejected"] blocker = "chaos.social" diff --git a/fba/http/federation.py b/fba/http/federation.py index 98afd1c..241bdd8 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -517,24 +517,42 @@ def determine_software(domain: str, path: str = None) -> str: logger.debug("software[%s]='%s' - EXIT!", type(software), software) return software -def find_domains(tag: bs4.element.Tag) -> list: - logger.debug("tag[]='%s' - CALLED!", type(tag)) +def find_domains(tag: bs4.element.Tag, domainColumn: str = "dt", reasonColumn: str = "dd", reasonText: str = "Categories:") -> list: + logger.debug("tag[]='%s',domainColumn='%s',reasonColumn='%s',reasonText='%s' - CALLED!", type(tag), domainColumn, reasonColumn, reasonText) if not isinstance(tag, bs4.element.Tag): raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag") - elif len(tag.select("tr")) == 0: - raise KeyError("No table rows found in table!") + elif not isinstance(domainColumn, str): + raise ValueError(f"Parameter domainColumn[]='{type(domainColumn)}' is not type of 'str'") + elif domainColumn == "": + raise ValueError("Parameter 'domainColumn' is an empty string") + elif not isinstance(reasonColumn, str): + raise ValueError(f"Parameter reasonColumn[]='{type(reasonColumn)}' is not type of 'str'") + elif reasonColumn == "": + raise ValueError("Parameter 'reasonColumn' is an empty string") + elif len(tag.find_all(domainColumn)) == 0: + raise KeyError("No domainColumn='{domainColumn}' rows found in table!") + elif len(tag.find_all(reasonColumn)) == 0: + raise KeyError("No reasonColumn='{reasonColumn}' rows found in table!") + elif not isinstance(reasonText, str): + raise ValueError(f"Parameter reasonText[]='{type(reasonText)}' is not type of 'str'") + elif reasonText == "": + raise ValueError("Parameter 'reasonText' is an empty string") domains = list() - for element in tag.select("tr"): - logger.debug("element[]='%s'", type(element)) - if not element.find("td"): - logger.debug("Skipping element, no found") - continue - - domain = tidyup.domain(element.find("td").text) - reason = tidyup.reason(element.findAll("td")[1].text) - + for element in tag.find_all(domainColumn): + logger.debug("element[%s]='%s'", type(element), element) + domain = tidyup.domain(element.text) + reasons = element.find_next(reasonColumn).text.split(reasonText)[1].splitlines() + logger.debug("reasons(%d)='%s'", len(reasons), reasons) + reason = None + for r in reasons: + logger.debug("r[%s]='%s'", type(r), r) + if r != "": + reason = r + break + + reason = tidyup.reason(reason) logger.debug("domain='%s',reason='%s'", domain, reason) if not domain_helper.is_wanted(domain): -- 2.39.5