git.mxchange.org Git - fba.git/commitdiff
Continued:
author Roland Häder <roland@mxchange.org>
Wed, 21 Jun 2023 00:28:35 +0000 (02:28 +0200)
committer Roland Häder <roland@mxchange.org>
Wed, 21 Jun 2023 00:28:35 +0000 (02:28 +0200)
- implemented lemmy.fetch_blocks()
- don't fetch generically, better only network.exceptions

fba/commands.py
fba/networks/lemmy.py
fba/networks/mastodon.py
fba/networks/pleroma.py

index ddeb8ec474527abe2dbb9446988403a23638bac5..f276a1f26fff63541c8e902de865767f7446ae55 100644 (file)
@@ -40,6 +40,7 @@ from fba.models import blocks
 from fba.models import instances
 
 from fba.networks import friendica
+from fba.networks import lemmy
 from fba.networks import mastodon
 from fba.networks import misskey
 from fba.networks import pleroma
@@ -195,7 +196,7 @@ def fetch_blocks(args: argparse.Namespace):
             mastodon.fetch_blocks(blocker, origin, nodeinfo_url)
         elif software == "lemmy":
             print(f"INFO: blocker='{blocker}',software='{software}'")
-            #lemmy.fetch_blocks(blocker, origin, nodeinfo_url)
+            lemmy.fetch_blocks(blocker, origin, nodeinfo_url)
         elif software == "friendica" or software == "misskey":
             print(f"INFO: blocker='{blocker}',software='{software}'")
 
index 5a43afc0a89d0cdcbfec7f651c16c4a80af8ce66..63829bb6deb662f0c3aa6a9e52bfc98fcc627d5b 100644 (file)
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+import inspect
+
+import bs4
+import validators
+
+from fba import blacklist
 from fba import config
 from fba import csrf
+from fba import fba
 from fba import federation
 from fba import network
 
+from fba.models import blocks
 from fba.models import instances
 
 def fetch_peers(domain: str) -> list:
@@ -62,7 +70,7 @@ def fetch_peers(domain: str) -> list:
             print("WARNING: JSON response does not contain 'federated_instances':", domain)
             instances.set_last_error(domain, data)
 
-    except BaseException as exception:
+    except network.exceptions as exception:
         print(f"WARNING: Exception during fetching JSON: domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
 
     # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
@@ -70,3 +78,109 @@ def fetch_peers(domain: str) -> list:
 
     # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
     return peers
+
+def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
+    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
+    if not isinstance(domain, str):
+        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
+    elif domain == "":
+        raise ValueError("Parameter 'domain' is empty")
+    elif not isinstance(origin, str) and origin is not None:
+        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
+    elif origin == "":
+        raise ValueError("Parameter 'origin' is empty")
+    elif not isinstance(nodeinfo_url, str):
+        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
+    elif nodeinfo_url == "":
+        raise ValueError("Parameter 'nodeinfo_url' is empty")
+
+    translations = [
+        "blocked instances",
+    ]
+
+    try:
+        # json endpoint for newer mastodongs
+        found_blocks = list()
+        blocklist = list()
+
+        rows = {
+            "reject"        : [],
+            "media_removal" : [],
+            "followers_only": [],
+            "report_removal": [],
+        }
+
+        # DEBUG: print(f"DEBUG: Fetching /instances from domain='{domain}'")
+        response = network.fetch_response(
+            domain,
+            "/instances",
+            network.web_headers,
+            (config.get("connection_timeout"), config.get("read_timeout"))
+        )
+
+        # DEBUG: print(f"DEBUG: response.ok='{response.ok}',response.status_code={response.status_code},response.text()={len(response.text)}")
+        if response.ok and response.status_code < 300 and response.text != "":
+            # DEBUG: print(f"DEBUG: Parsing {len(response.text)} Bytes ...")
+
+            doc = bs4.BeautifulSoup(response.text, "html.parser")
+            # DEBUG: print(f"DEBUG: doc[]={type(doc)}")
+
+            headers = doc.findAll("h5")
+            found = None
+            # DEBUG: print(f"DEBUG: Search in {len(headers)} header(s) ...")
+            for header in headers:
+                # DEBUG: print(f"DEBUG: header[]={type(header)}")
+                content = header.contents[0]
+                # DEBUG: print(f"DEBUG: content='{content}'")
+                if content.lower() in translations:
+                    # DEBUG: print("DEBUG: Found header with blocked instances - BREAK!")
+                    found = header
+                    break
+
+            # DEBUG: print(f"DEBUG: found[]='{type(found)}'")
+            if found is None:
+                # DEBUG: print(f"DEBUG: domain='{domain}' is not blocking any instances - EXIT!")
+                return
+
+            blocking = found.find_next("ul").findAll("a")
+            # DEBUG: print(f"DEBUG: Found {len(blocking)} blocked instance(s) ...")
+            for tag in blocking:
+                # DEBUG: print(f"DEBUG: tag[]='{type(tag)}'")
+                blocked = tag.contents[0]
+
+                # DEBUG: print(f"DEBUG: blocked='{blocked}'")
+                if not validators.domain(blocked):
+                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is not a valid domain - SKIPPED!")
+                    continue
+                elif blacklist.is_blacklisted(blocked):
+                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
+                    continue
+                elif blocked.endswith(".arpa"):
+                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
+                    continue
+                elif blocked.endswith(".tld"):
+                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                    continue
+                elif not instances.is_registered(blocked):
+                    # DEBUG: print("DEBUG: Hash wasn't found, adding:", blocked, domain)
+                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
+
+                if not blocks.is_instance_blocked(domain, blocked, "reject"):
+                    # DEBUG: print("DEBUG: Blocking:", domain, blocked)
+                    blocks.add_instance(domain, blocked, None, "reject")
+
+                    found_blocks.append({
+                        "blocked": blocked,
+                        "reason" : None
+                    })
+                else:
+                    # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
+                    blocks.update_last_seen(domain, blocked, "reject")
+
+        # DEBUG: print("DEBUG: Committing changes ...")
+        fba.connection.commit()
+    except network.exceptions as exception:
+        print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'")
+
+    # DEBUG: print("DEBUG: EXIT!")
index 293b8ddb2cc7e57d2deda9e23ab12017dacb0457..b0392ea13f5e751dfc85c29d01d29112ea6b5cdf 100644 (file)
@@ -82,8 +82,8 @@ def fetch_blocks_from_about(domain: str) -> dict:
                 # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!")
                 break
 
-        except BaseException as exception:
-            print("ERROR: Cannot fetch from domain:", domain, exception)
+        except network.exceptions as exception:
+            print(f"ERROR: Cannot fetch from domain='{domain}',exception='{type(exception)}'")
             instances.set_last_error(domain, exception)
             break
 
@@ -284,6 +284,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 elif blocked.endswith(".arpa"):
                     print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                     continue
+                elif blocked.endswith(".tld"):
+                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                    continue
                 elif not instances.is_registered(blocked):
                     # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                     instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -295,6 +298,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 elif blocked.endswith(".arpa"):
                     print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                     continue
+                elif blocked.endswith(".tld"):
+                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                    continue
                 elif not instances.is_registered(blocked):
                     # DEBUG: print("DEBUG: Hash wasn't found, adding:", blocked, domain)
                     instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
index 39ca927d4fe80248bd1c4c6c94480185d7419db9..f23393cabc70aaf60f5610d1075f1bab1865505f 100644 (file)
@@ -143,6 +143,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     elif blocked.endswith(".arpa"):
                         print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                         continue
+                    elif blocked.endswith(".tld"):
+                        print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                        continue
                     elif not instances.is_registered(blocked):
                         # Commit changes
                         fba.connection.commit()
@@ -213,6 +216,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
             elif blocked.endswith(".arpa"):
                 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                 continue
+            elif blocked.endswith(".tld"):
+                print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                continue
             elif not instances.is_registered(blocked):
                 # Commit changes
                 fba.connection.commit()
@@ -316,6 +322,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 elif blocked.endswith(".arpa"):
                     print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                     continue
+                elif blocked.endswith(".tld"):
+                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                    continue
                 elif not instances.is_registered(blocked):
                     # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                     instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -388,6 +397,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
             elif blocked.endswith(".arpa"):
                 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                 continue
+            elif blocked.endswith(".tld"):
+                print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                continue
             elif not instances.is_registered(blocked):
                 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -460,6 +472,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     elif blocked.endswith(".arpa"):
                         print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                         continue
+                    elif blocked.endswith(".tld"):
+                        print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
+                        continue
                     elif not instances.is_registered(blocked):
                         # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                         instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -519,7 +534,7 @@ def fetch_blocks_from_about(domain: str) -> dict:
                 # DEBUG: print(f"DEBUG: Found 'h2' header in path='{path}' - BREAK!")
                 break
 
-        except BaseException as exception:
+        except network.exceptions as exception:
             print("ERROR: Cannot fetch from domain:", domain, exception)
             instances.set_last_error(domain, exception)
             break