From: Roland Häder <roland@mxchange.org>
Date: Sun, 11 Jun 2023 16:31:22 +0000 (+0200)
Subject: Continued:
X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=3112d0947b02f4ee1b3470fbb42cdb1a45e7daf6;p=fba.git

Continued:
- fixed handling of 404 (no exception) case when nodeinfo wasn't found
- copied (WIP!) fetching /about page to pleroma
- also added /about next to /about/more as a possible page for mastodon
---

diff --git a/fba/federation.py b/fba/federation.py
index 42d847f..06d07ab 100644
--- a/fba/federation.py
+++ b/fba/federation.py
@@ -195,11 +195,7 @@ def fetch_nodeinfo(domain: str, path: str = None) -> dict:
 
     # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
     nodeinfo = fetch_wellknown_nodeinfo(domain)
-
     # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]='{nodeinfo}'")
-    if "error_message" in nodeinfo:
-        print(f"WARNING: Error during fetching nodeinfo: '{nodeinfo['error_message']}' - EXIT!")
-        return nodeinfo
 
     # No CSRF by default, you don't have to add network.api_headers by yourself here
     headers = tuple()
@@ -389,17 +385,13 @@ def determine_software(domain: str, path: str = None) -> str:
     data = fetch_nodeinfo(domain, path)
 
     # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'")
-    if "error_message" in data:
-        # DEBUG: print(f"DEBUG: Could not determine software type, domain='{domain}'")
-        if "exception" in data:
-            # Continue raising it
-            raise data["exception"]
-        else:
-            # Raise generic exception if none is attached
-            raise Exception(f"Cannot fetch nodeinfo from domain='{domain}': '{data['error_message']}'")
-
-    # DEBUG: print("DEBUG: data():", len(data), data)
-    if "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
+    if "exception" in data:
+        # Continue raising it
+        raise data["exception"]
+    elif "error_message" in data:
+        print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'")
+        return fetch_generator_from_path(domain)
+    elif "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
         print("WARNING: JSON response is an error:", data["json"]["message"])
         instances.update_last_error(domain, data["json"]["message"])
         return fetch_generator_from_path(domain)
diff --git a/fba/network.py b/fba/network.py
index bc7a95f..18e9437 100644
--- a/fba/network.py
+++ b/fba/network.py
@@ -76,9 +76,10 @@ def post_json_api(domain: str, path: str, data: str, headers: dict = {}) -> dict
             print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',data()={len(data)},response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'")
             json_reply["status_code"]   = response.status_code
             json_reply["error_message"] = response.reason
+            del json_reply["json"]
             instances.update_last_error(domain, response)
 
-    except requests.exceptions.ConnectionError as exception:
+    except exceptions as exception:
         # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'")
         json_reply["status_code"]   = 999
         json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'"
@@ -111,8 +112,9 @@ def fetch_api_url(url: str, timeout: tuple) -> dict:
             print(f"WARNING: Cannot query JSON API: url='{url}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'")
             json_reply["status_code"]   = response.status_code
             json_reply["error_message"] = response.reason
+            del json_reply["json"]
 
-    except requests.exceptions.ConnectionError as exception:
+    except exceptions as exception:
         # DEBUG: print(f"DEBUG: Fetching '{url}' failed. exception[{type(exception)}]='{str(exception)}'")
         json_reply["status_code"]   = 999
         json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'"
@@ -149,7 +151,7 @@ def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict:
             timeout=timeout
         )
 
-    except requests.exceptions.ConnectionError as exception:
+    except exceptions as exception:
         # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'")
         json_reply["status_code"]   = 999
         json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'"
@@ -164,6 +166,7 @@ def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict:
         print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'")
         json_reply["status_code"]   = response.status_code
         json_reply["error_message"] = response.reason
+        del json_reply["json"]
         instances.update_last_error(domain, response)
 
     # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}")
@@ -237,7 +240,7 @@ def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> req
             timeout=timeout
         )
 
-    except requests.exceptions.ConnectionError as exception:
+    except exceptions as exception:
         # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'")
         instances.update_last_error(domain, exception)
         raise exception
diff --git a/fba/networks/mastodon.py b/fba/networks/mastodon.py
index e0525fa..f26d7c9 100644
--- a/fba/networks/mastodon.py
+++ b/fba/networks/mastodon.py
@@ -70,20 +70,33 @@ def fetch_blocks_from_about(domain: str) -> dict:
         "Silenced servers" : [],
     }
 
-    try:
-        doc = bs4.BeautifulSoup(
-            network.fetch_response(
-                domain,
-                "/about/more",
-                network.web_headers,
-                (config.get("connection_timeout"), config.get("read_timeout"))
-            ).text,
-            "html.parser",
-        )
-    except BaseException as exception:
-        print("ERROR: Cannot fetch from domain:", domain, exception)
-        instances.update_last_error(domain, exception)
-        return {}
+    doc = None
+    for path in ("/about/more", "/about"):
+        try:
+            print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
+            doc = bs4.BeautifulSoup(
+                network.fetch_response(
+                    domain,
+                    path,
+                    network.web_headers,
+                    (config.get("connection_timeout"), config.get("read_timeout"))
+                ).text,
+                "html.parser",
+            )
+
+            if len(doc.find_all("h3")) > 0:
+                print(f"DEBUG: path='{path}' had some headlines - BREAK!")
+                break
+
+        except BaseException as exception:
+            print("ERROR: Cannot fetch from domain:", domain, exception)
+            instances.update_last_error(domain, exception)
+            break
+
+    print(f"DEBUG: doc[]='{type(doc)}'")
+    if doc is None:
+        print(f"WARNING: Cannot find any 'h3' tags for domain='{domain}' - EXIT!")
+        return blocklist
 
     for header in doc.find_all("h3"):
         header_text = tidyup.reason(header.text)
diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py
index 62cf4c7..5a6f7ec 100644
--- a/fba/networks/pleroma.py
+++ b/fba/networks/pleroma.py
@@ -76,60 +76,64 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 continue
 
             # DEBUG: print(f"DEBUG: Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
-            for blocked in blocklist:
-                # DEBUG: print("DEBUG: BEFORE blocked:", blocked)
-                blocked = tidyup.domain(blocked)
-                # DEBUG: print("DEBUG: AFTER blocked:", blocked)
-
-                if blocked == "":
-                    print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
-                    continue
-                elif blacklist.is_blacklisted(blocked):
-                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
-                    continue
-                elif blocked.count("*") > 1:
-                    # -ACK!-oma also started obscuring domains without hash
-                    fba.cursor.execute(
-                        "SELECT domain, nodeinfo_url FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", [blocked.replace("*", "_")]
-                    )
-                    searchres = fba.cursor.fetchone()
+            if len(blocklist) > 0:
+                for blocked in blocklist:
+                    # DEBUG: print("DEBUG: BEFORE blocked:", blocked)
+                    blocked = tidyup.domain(blocked)
+                    # DEBUG: print("DEBUG: AFTER blocked:", blocked)
 
-                    print(f"DEBUG: searchres[]='{type(searchres)}'")
-                    if searchres is None:
-                        print(f"WARNING: Cannot deobsfucate blocked='{blocked}' - SKIPPED!")
+                    if blocked == "":
+                        print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
+                        continue
+                    elif blacklist.is_blacklisted(blocked):
+                        # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
                         continue
+                    elif blocked.count("*") > 1:
+                        # -ACK!-oma also started obscuring domains without hash
+                        fba.cursor.execute(
+                            "SELECT domain, nodeinfo_url FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", [blocked.replace("*", "_")]
+                        )
+                        searchres = fba.cursor.fetchone()
 
-                    blocked = searchres[0]
-                    nodeinfo_url = searchres[1]
-                    # DEBUG: print("DEBUG: Looked up domain:", blocked)
-                elif not validators.domain(blocked):
-                    print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - skipped!")
-                    continue
+                        print(f"DEBUG: searchres[]='{type(searchres)}'")
+                        if searchres is None:
+                            print(f"WARNING: Cannot deobsfucate blocked='{blocked}' - SKIPPED!")
+                            continue
 
-                # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
-                if not validators.domain(blocked):
-                    print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - skipped!")
-                    continue
-                elif blocked.split(".")[-1] == "arpa":
-                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                    continue
-                elif not instances.is_registered(blocked):
-                    # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
-                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
+                        blocked = searchres[0]
+                        nodeinfo_url = searchres[1]
+                        # DEBUG: print("DEBUG: Looked up domain:", blocked)
+                    elif not validators.domain(blocked):
+                        print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - skipped!")
+                        continue
 
-                if not blocks.is_instance_blocked(domain, blocked, block_level):
-                    # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
-                    blocks.add_instance(domain, blocked, "unknown", block_level)
+                    # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
+                    if not validators.domain(blocked):
+                        print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - skipped!")
+                        continue
+                    elif blocked.split(".")[-1] == "arpa":
+                        print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
+                        continue
+                    elif not instances.is_registered(blocked):
+                        # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
+                        instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
+
+                    if not blocks.is_instance_blocked(domain, blocked, block_level):
+                        # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
+                        blocks.add_instance(domain, blocked, "unknown", block_level)
 
-                    if block_level == "reject":
-                        # DEBUG: print("DEBUG: Adding to blockdict:", blocked)
-                        blockdict.append({
-                                "blocked": blocked,
-                                "reason" : None
-                        })
-                    else:
-                        # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
-                        blocks.update_last_seen(domain, blocked, block_level)
+                        if block_level == "reject":
+                            # DEBUG: print("DEBUG: Adding to blockdict:", blocked)
+                            blockdict.append({
+                                    "blocked": blocked,
+                                    "reason" : None
+                            })
+                        else:
+                            # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
+                            blocks.update_last_seen(domain, blocked, block_level)
+            else:
+                # DEBUG: print(f"DEBUG: domain='{domain}' has returned zero rows, trying /about/more page ...")
+                rows = fetch_blocks_from_about(domain)
 
     # DEBUG: print("DEBUG: Committing changes ...")
     fba.connection.commit()
@@ -203,3 +207,76 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
     fba.connection.commit()
 
     # DEBUG: print("DEBUG: EXIT!")
+
+def fetch_blocks_from_about(domain: str) -> dict:
+    """Scrape the block lists from the '/about/more' or '/about' page of domain.
+
+    Returns {"reject", "media_removal", "followers_only"}, each a list of
+    {"domain", "hash", "reason"} dicts; raises ValueError on a non-str/empty domain.
+    """
+    print(f"DEBUG: domain='{domain}' - CALLED!")
+    if not isinstance(domain, str):
+        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
+    elif domain == "":
+        raise ValueError("Parameter 'domain' is empty")
+
+    print("DEBUG: Fetching mastodon blocks from domain:", domain)
+    blocklist = {
+        "Suspended servers": [],
+        "Filtered media"   : [],
+        "Limited servers"  : [],
+        "Silenced servers" : [],
+    }
+
+    doc = None
+    for path in ("/about/more", "/about"):
+        try:
+            print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
+            doc = bs4.BeautifulSoup(
+                network.fetch_response(
+                    domain,
+                    path,
+                    network.web_headers,
+                    (config.get("connection_timeout"), config.get("read_timeout"))
+                ).text,
+                "html.parser",
+            )
+            if len(doc.find_all("h3")) > 0:
+                print(f"DEBUG: path='{path}' had some headlines - BREAK!")
+                break
+        except Exception as exception:  # not BaseException, so Ctrl-C/SystemExit still propagate
+            print("ERROR: Cannot fetch from domain:", domain, exception)
+            instances.update_last_error(domain, exception)
+            break
+
+    print(f"DEBUG: doc[]='{type(doc)}'")
+    if doc is None:
+        print(f"WARNING: Cannot find any 'h3' tags for domain='{domain}' - EXIT!")
+        # Same keys as the regular return, so callers always get one shape
+        return {"reject": [], "media_removal": [], "followers_only": []}
+
+    for header in doc.find_all("h3"):
+        header_text = tidyup.reason(header.text)
+        print(f"DEBUG: header_text='{header_text}'")
+        if header_text in language_mapping:
+            print(f"DEBUG: header_text='{header_text}'")
+            header_text = language_mapping[header_text]
+        else:
+            print(f"WARNING: header_text='{header_text}' not found in language mapping table")
+        if header_text not in blocklist:
+            print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
+            continue
+        # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
+        for line in header.find_all_next("table")[0].find_all("tr")[1:]:
+            blocklist[header_text].append({
+                "domain": tidyup.domain(line.find("span").text),
+                "hash"  : tidyup.domain(line.find("span")["title"][9:]),
+                "reason": tidyup.reason(line.find_all("td")[1].text),
+            })
+
+    print("DEBUG: Returning blocklist for domain:", domain)
+    return {
+        "reject"        : blocklist["Suspended servers"],
+        "media_removal" : blocklist["Filtered media"],
+        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
+    }