git.mxchange.org Git - fba.git/commitdiff
Continued:
author: Roland Häder <roland@mxchange.org>
Sat, 1 Jul 2023 02:34:06 +0000 (04:34 +0200)
committer: Roland Häder <roland@mxchange.org>
Sat, 1 Jul 2023 02:34:06 +0000 (04:34 +0200)
- skip empty domains
- rewrote handling of blocklist from pleroma

fba/commands.py
fba/networks/friendica.py
fba/networks/pleroma.py

index 55be64e8ee8c7421c13ff77749750377c2d2d261..bd755b73bba4faaac5120cf4bc9e2ecb204f3866 100644 (file)
@@ -109,8 +109,11 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int:
             if "domain" not in row:
                 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                 continue
+            elif row["domain"] == "":
+                logger.debug("row[domain] is empty - SKIPPED!")
+                continue
             elif not utils.is_domain_wanted(row["domain"]):
-                logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
+                logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                 continue
             elif instances.is_registered(row["domain"]):
                 logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
@@ -160,8 +163,11 @@ def fetch_bkali(args: argparse.Namespace) -> int:
             if "domain" not in entry:
                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                 continue
+            elif entry["domain"] == "":
+                logger.debug("entry[domain] is empty - SKIPPED!")
+                continue
             elif not utils.is_domain_wanted(entry["domain"]):
-                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
+                logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                 continue
             elif instances.is_registered(entry["domain"]):
                 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
@@ -327,8 +333,11 @@ def fetch_blocks(args: argparse.Namespace) -> int:
                 nodeinfo_url     = row["nodeinfo_url"]
 
             logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
-            if not utils.is_domain_wanted(block["blocked"]):
-                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+            if block["blocked"] == "":
+                logger.debug("block[blocked] is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(block["blocked"]):
+                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                 continue
             elif block["block_level"] in ["accept", "accepted"]:
                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
@@ -429,9 +438,12 @@ def fetch_observer(args: argparse.Namespace) -> int:
             logger.debug("item[]='%s'", type(item))
             domain = item.decode_contents()
 
-            logger.debug("domain='%s'", domain)
-            if not utils.is_domain_wanted(domain):
-                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+            logger.debug("domain='%s' - AFTER!", domain)
+            if domain == "":
+                logger.debug("domain is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(domain):
+                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
             elif instances.is_registered(domain):
                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
@@ -604,10 +616,14 @@ def fetch_fba_rss(args: argparse.Namespace) -> int:
         logger.debug("rss[]='%s'", type(rss))
         for item in rss.items:
             logger.debug("item='%s'", item)
-            domain = item.link.split("=")[1]
+            domain = tidyup.domain(item.link.split("=")[1])
 
-            if not utils.is_domain_wanted(domain):
-                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+            logger.debug("domain='%s' - AFTER!", domain)
+            if domain == "":
+                logger.debug("domain is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(domain):
+                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
             elif domain in domains:
                 logger.debug("domain='%s' is already added - SKIPPED!", domain)
@@ -659,13 +675,17 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int:
             doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
             logger.debug("doc[]='%s'", type(doc))
             for element in doc.findAll("a"):
+                logger.debug("element[]='%s'", type(element))
                 for href in element["href"].split(","):
-                    logger.debug("href[%s]='%s", type(href), href)
+                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                     domain = tidyup.domain(href)
 
-                    logger.debug("domain='%s'", domain)
-                    if not utils.is_domain_wanted(domain):
-                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    logger.debug("domain='%s' - AFTER!", domain)
+                    if domain == "":
+                        logger.debug("domain is empty - SKIPPED!")
+                        continue
+                    elif not utils.is_domain_wanted(domain):
+                        logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                         continue
                     elif domain in domains:
                         logger.debug("domain='%s' is already added - SKIPPED!", domain)
@@ -723,9 +743,12 @@ def fetch_instances(args: argparse.Namespace) -> int:
     rows = database.cursor.fetchall()
     logger.info("Checking %d entries ...", len(rows))
     for row in rows:
-        logger.debug("domain='%s'", row["domain"])
-        if not utils.is_domain_wanted(row["domain"]):
-            logger.debug("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
+        logger.debug("row[domain]='%s'", row["domain"])
+        if row["domain"] == "":
+            logger.debug("row[domain] is empty - SKIPPED!")
+            continue
+        elif not utils.is_domain_wanted(row["domain"]):
+            logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
             continue
 
         try:
@@ -846,8 +869,11 @@ def fetch_oliphant(args: argparse.Namespace) -> int:
                 reject_reports = True
 
             logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
-            if not utils.is_domain_wanted(domain):
-                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+            if domain == "":
+                logger.debug("domain is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(domain):
+                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
 
             logger.debug("Marking domain='%s' as handled", domain)
@@ -902,12 +928,15 @@ def fetch_txt(args: argparse.Namespace) -> int:
 
             logger.info("Processing %d domains ...", len(domains))
             for domain in domains:
-                logger.debug("domain='%s'", domain)
+                logger.debug("domain='%s' - BEFORE!", domain)
+                domain = tidyup.domain(domain)
+
+                logger.debug("domain='%s' - AFTER!", domain)
                 if domain == "":
                     logger.debug("domain is empty - SKIPPED!")
                     continue
                 elif not utils.is_domain_wanted(domain):
-                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                     continue
                 elif instances.is_recent(domain):
                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
@@ -943,12 +972,12 @@ def fetch_fedipact(args: argparse.Namespace) -> int:
             logger.debug("row[]='%s'", type(row))
             domain = tidyup.domain(row.contents[0])
 
-            logger.debug("domain='%s'", domain)
+            logger.debug("domain='%s' - AFTER!", domain)
             if domain == "":
                 logger.debug("domain is empty - SKIPPED!")
                 continue
             elif not utils.is_domain_wanted(domain):
-                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
             elif instances.is_registered(domain):
                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
@@ -1064,10 +1093,15 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int:
 
     logger.debug("blocking()=%d", blocking)
     for block in blocking:
+        logger.debug("block[]='%s'", type(block))
         block["blocked"] = tidyup.domain(block["blocked"])
 
-        if not utils.is_domain_wanted(block["blocked"]):
-            logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+        logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
+        if block["blocked"] == "":
+            logger.debug("block[blocked] is empty - SKIPPED!")
+            continue
+        elif not utils.is_domain_wanted(block["blocked"]):
+            logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
             continue
         elif instances.is_recent(block["blocked"]):
             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
@@ -1082,10 +1116,15 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int:
         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
 
         for block in blocking:
+            logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block["reason"])
             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
 
-            if not utils.is_domain_wanted(block["blocked"]):
-                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+            logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
+            if block["blocked"] == "":
+                logger.debug("block[blocked] is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(block["blocked"]):
+                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                 continue
 
             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
@@ -1151,10 +1190,13 @@ def recheck_obfuscation(args: argparse.Namespace) -> int:
         obfuscated = 0
         blockdict = list()
         for block in blocking:
-            logger.debug("blocked='%s'", block["blocked"])
+            logger.debug("block[blocked]='%s'", block["blocked"])
             blocked = None
 
-            if block["blocked"].endswith(".arpa"):
+            if block["blocked"] == "":
+                logger.debug("block[blocked] is empty - SKIPPED!")
+                continue
+            elif block["blocked"].endswith(".arpa"):
                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
                 continue
             elif block["blocked"].endswith(".tld"):
@@ -1168,7 +1210,7 @@ def recheck_obfuscation(args: argparse.Namespace) -> int:
                 obfuscated = obfuscated + 1
                 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
             elif not utils.is_domain_wanted(block["blocked"]):
-                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                 continue
             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
index 459f94872aca8057cfbbf09afad2b8429ac4a04e..ab985c7f3f69413e4f56a4d53939f187a2b35dbc 100644 (file)
@@ -85,9 +85,10 @@ def fetch_blocks(domain: str) -> list:
             logger.debug("blocked='%s' is not wanted - SKIPPED!", domain)
             continue
 
+        logger.debug("blocked='%s',domain='%s' - BEFORE!", blocked, domain)
         blocked = utils.deobfuscate_domain(blocked, domain)
 
-        logger.debug("blocked[%s]='%s'", type(blocked), blocked)
+        logger.debug("blocked[%s]='%s' - DEOBFUSCATED!", type(blocked), blocked)
         if not utils.is_domain_wanted(blocked):
             logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
             continue
index a701fdc186b684b8e18ac6dccfa2d85f14457ec3..435d7ce2d3099ea38f5e4ca2cc51f93474365e6b 100644 (file)
@@ -36,7 +36,10 @@ logger = logging.getLogger(__name__)
 # Language mapping X -> English
 language_mapping = {
     # English -> English
-    "Reject": "Suspended servers",
+    "limited servers"  : "followers_only",
+    "suspended servers": "reject",
+    "silenced servers" : "silenced",
+    "filtered media"   : "filtered_media",
 }
 
 def fetch_blocks(domain: str, nodeinfo_url: str) -> list:
@@ -232,16 +235,13 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list:
         for blocked in rows:
             logger.debug("blocked='%s' - BEFORE!", blocked)
             blocked = tidyup.domain(blocked)
-            logger.debug("blocked='%s' - AFTER!", blocked)
+            reason = tidyup.reason(rows[blocked]["reason"])
+            logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
             if blocked not in rows or "reason" not in rows[blocked]:
                 logger.warning("Cannot find blocked='%s' in rows()=%d,domain='%s' - BREAK!", blocked, len(rows), domain)
                 break
-
-            reason = tidyup.reason(rows[blocked]["reason"])
-            logger.debug("reason='%s'", reason)
-
-            if blocked == "":
+            elif blocked == "":
                 logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
                 continue
             elif not utils.is_domain_wanted(blocked):
@@ -271,35 +271,26 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list:
 
         logger.debug("blocklist()=%d", len(blocklist))
         if len(blocklist) > 0:
-            logger.info("Checking %d record(s) ...", len(blocklist))
+            logger.info("Checking %d different blocklists ...", len(blocklist))
             for block_level in blocklist:
                 logger.debug("block_level='%s'", block_level)
                 rows = blocklist[block_level]
 
                 logger.debug("rows[%s]()=%d'", type(rows), len(rows))
-                for record in rows:
-                    logger.debug("record[]='%s'", type(record))
-                    blocked = tidyup.domain(record["blocked"])
-                    reason  = tidyup.reason(record["reason"])
-                    logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
-
-                    if not utils.is_domain_wanted(blocked):
-                        logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
-                        continue
-
-                    logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain)
-                    blocked = utils.deobfuscate_domain(blocked, domain)
+                for block in rows:
+                    logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", block["blocked"], domain)
+                    block["blocked"] = utils.deobfuscate_domain(block["blocked"], domain)
 
-                    logger.debug("blocked='%s' - DEOBFUSCATED!", blocked)
-                    if not utils.is_domain_wanted(blocked):
-                        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
+                    logger.debug("block[blocked]='%s' - DEOBFUSCATED!", block["blocked"])
+                    if not utils.is_domain_wanted(block["blocked"]):
+                        logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
                         continue
 
-                    logger.debug("Appending blocker='%s',blocked='%s',reason='%s',block_level='%s' ...",domain, blocked, reason, block_level)
+                    logger.debug("Appending blocker='%s',block[blocked]='%s',block[reason]='%s',block_level='%s' ...",domain, block["blocked"], block["reason"], block_level)
                     blockdict.append({
                         "blocker"    : domain,
-                        "blocked"    : blocked,
-                        "reason"     : reason,
+                        "blocked"    : block["blocked"],
+                        "reason"     : block["reason"],
                         "block_level": block_level,
                     })
 
@@ -347,10 +338,10 @@ def fetch_blocks_from_about(domain: str) -> dict:
             break
 
     blocklist = {
-        "Suspended servers": [],
-        "Filtered media"   : [],
-        "Limited servers"  : [],
-        "Silenced servers" : [],
+        "reject"        : [],
+        "filtered_media": [],
+        "followers_only": [],
+        "silenced"      : [],
     }
 
     logger.debug("doc[]='%s'", type(doc))
@@ -358,33 +349,41 @@ def fetch_blocks_from_about(domain: str) -> dict:
         logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
         return list()
 
-    for header in doc.find_all("h2"):
+    headers = doc.find_all("h2")
+
+    logger.debug("headers[]='%s'", type(headers))
+    if headers is None:
+        logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
+        return list()
+
+    logger.info("Checking %d headers ...", len(headers))
+    for header in headers:
         logger.debug("header[%s]='%s'", type(header), header)
-        header_text = tidyup.reason(header.text)
+        block_level = tidyup.reason(header.text).lower()
 
-        logger.debug("header_text='%s' - BEFORE!", header_text)
-        if header_text in language_mapping:
-            logger.debug("header_text='%s' - FOUND!", header_text)
-            header_text = language_mapping[header_text]
+        logger.debug("block_level='%s' - BEFORE!", block_level)
+        if block_level in language_mapping:
+            logger.debug("block_level='%s' - FOUND!", block_level)
+            block_level = language_mapping[block_level].lower()
         else:
-            logger.warning("header_text='%s' not found in language mapping table", header_text)
+            logger.warning("block_level='%s' not found in language mapping table", block_level)
 
-        logger.debug("header_text='%s - AFTER!'", header_text)
-        if header_text in blocklist or header_text.lower() in blocklist:
+        logger.debug("block_level='%s - AFTER!'", block_level)
+        if block_level in blocklist:
             # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
-            logger.debug("Found header_text='%s', importing domain blocks ...", header_text)
+            logger.debug("Found block_level='%s', importing domain blocks ...", block_level)
             for line in header.find_next("table").find_all("tr")[1:]:
                 logger.debug("line[]='%s'", type(line))
-                blocklist[header_text].append({
-                    "blocked": tidyup.domain(line.find_all("td")[0].text),
-                    "reason" : tidyup.reason(line.find_all("td")[1].text),
+                blocked = tidyup.domain(line.find_all("td")[0].text)
+                reason = tidyup.reason(line.find_all("td")[1].text)
+
+                logger.debug("Appending block_level='%s',blocked='%s',reason='%s' ...", block_level, blocked, reason)
+                blocklist[block_level].append({
+                    "blocked": blocked,
+                    "reason" : reason,
                 })
         else:
-            logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist))
+            logger.warning("block_level='%s' not found in blocklist()=%d", block_level, len(blocklist))
 
     logger.debug("Returning blocklist for domain='%s' - EXIT!", domain)
-    return {
-        "reject"        : blocklist["Suspended servers"],
-        "media_removal" : blocklist["Filtered media"],
-        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
-    }
+    return blocklist