git.mxchange.org Git - fba.git/commitdiff
Continued:
author Roland Häder <roland@mxchange.org>
Sun, 15 Sep 2024 15:29:42 +0000 (17:29 +0200)
committer Roland Häder <roland@mxchange.org>
Sun, 15 Sep 2024 15:29:42 +0000 (17:29 +0200)
- rewrote chaos.social parser for their own documentation at meta.chaos.social

fba/commands.py
fba/http/federation.py

index 6726b025472bd1981617f77026cca27d2b8d2496..4abf8b74518bce0df3597ddc8b8283e338ea075e 100644 (file)
@@ -747,7 +747,7 @@ def fetch_cs(args: argparse.Namespace):
         "rejected": list(),
     }
 
-    source_domain = "raw.githubusercontent.com"
+    source_domain = "meta.chaos.social"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
         return 1
@@ -757,7 +757,7 @@ def fetch_cs(args: argparse.Namespace):
 
     logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
     raw = network.fetch_url(
-        f"https://{source_domain}/chaossocial/meta/master/federation.md",
+        f"https://{source_domain}/federation",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
     ).text
@@ -766,13 +766,13 @@ def fetch_cs(args: argparse.Namespace):
     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
     logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
 
-    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
+    silenced = doc.find("h2", {"id": "silenced-instances"}).find_next("dl", attrs={"class": "instance-list"})
     logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
     blocklist["silenced"] = federation.find_domains(silenced)
 
-    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
-    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
-    blocklist["rejected"] = federation.find_domains(blocked)
+    defederated = doc.find("h2", {"id": "defederated-instances"}).find_next("dl", attrs={"class": "instance-list"})
+    logger.debug("defederated[%s]()=%d", type(defederated), len(defederated))
+    blocklist["rejected"] = federation.find_domains(defederated)
 
     blocking = blocklist["silenced"] + blocklist["rejected"]
     blocker = "chaos.social"
index 98afd1c4da61396b30e315d893d14b9354609a9e..241bdd801cebb38f759e7a2aea2f47091e64d4e7 100644 (file)
@@ -517,24 +517,42 @@ def determine_software(domain: str, path: str = None) -> str:
     logger.debug("software[%s]='%s' - EXIT!", type(software), software)
     return software
 
-def find_domains(tag: bs4.element.Tag) -> list:
-    logger.debug("tag[]='%s' - CALLED!", type(tag))
+def find_domains(tag: bs4.element.Tag, domainColumn: str = "dt", reasonColumn: str = "dd", reasonText: str = "Categories:") -> list:
+    logger.debug("tag[]='%s',domainColumn='%s',reasonColumn='%s',reasonText='%s' - CALLED!", type(tag), domainColumn, reasonColumn, reasonText)
 
     if not isinstance(tag, bs4.element.Tag):
         raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
-    elif len(tag.select("tr")) == 0:
-        raise KeyError("No table rows found in table!")
+    elif not isinstance(domainColumn, str):
+        raise ValueError(f"Parameter domainColumn[]='{type(domainColumn)}' is not type of 'str'")
+    elif domainColumn == "":
+        raise ValueError("Parameter 'domainColumn' is an empty string")
+    elif not isinstance(reasonColumn, str):
+        raise ValueError(f"Parameter reasonColumn[]='{type(reasonColumn)}' is not type of 'str'")
+    elif reasonColumn == "":
+        raise ValueError("Parameter 'reasonColumn' is an empty string")
+    elif len(tag.find_all(domainColumn)) == 0:
+        raise KeyError("No domainColumn='{domainColumn}' rows found in table!")
+    elif len(tag.find_all(reasonColumn)) == 0:
+        raise KeyError("No reasonColumn='{reasonColumn}' rows found in table!")
+    elif not isinstance(reasonText, str):
+        raise ValueError(f"Parameter reasonText[]='{type(reasonText)}' is not type of 'str'")
+    elif reasonText == "":
+        raise ValueError("Parameter 'reasonText' is an empty string")
 
     domains = list()
-    for element in tag.select("tr"):
-        logger.debug("element[]='%s'", type(element))
-        if not element.find("td"):
-            logger.debug("Skipping element, no <td> found")
-            continue
-
-        domain = tidyup.domain(element.find("td").text)
-        reason = tidyup.reason(element.findAll("td")[1].text)
-
+    for element in tag.find_all(domainColumn):
+        logger.debug("element[%s]='%s'", type(element), element)
+        domain = tidyup.domain(element.text)
+        reasons = element.find_next(reasonColumn).text.split(reasonText)[1].splitlines()
+        logger.debug("reasons(%d)='%s'", len(reasons), reasons)
+        reason = None
+        for r in reasons:
+            logger.debug("r[%s]='%s'", type(r), r)
+            if r != "":
+                reason = r
+                break
+
+        reason = tidyup.reason(reason)
         logger.debug("domain='%s',reason='%s'", domain, reason)
 
         if not domain_helper.is_wanted(domain):