]> git.mxchange.org Git - fba.git/blobdiff - fba/networks/friendica.py
Continued:
[fba.git] / fba / networks / friendica.py
index b3f6bc40d342ceee218fa42b6567b4a2026ef940..1434697ae1ac06146eb4b78d79eb275e961c38ef 100644 (file)
 import logging
 
 import bs4
-import validators
 
-from fba.helpers import blacklist
 from fba.helpers import config
+from fba.helpers import domain as domain_helper
 from fba.helpers import tidyup
 
 from fba.http import network
@@ -29,85 +28,80 @@ from fba.models import instances
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+#logger.setLevel(logging.DEBUG)
 
def fetch_blocks(domain: str) -> list:
    """Fetch the blocklist from a Friendica instance's /friendica page.

    The page renders its blocklist as an HTML table inside an element with
    id="about_blocklist"; each data row holds the blocked domain in the
    first cell and the reason in the second.

    Parameters:
        domain: Fully qualified, already-validated domain name of the
                instance to crawl (validated again via domain_helper.raise_on()).

    Returns:
        A list of dicts with keys "blocker", "blocked", "reason" and
        "block_level" (always "reject" — Friendica only publishes one
        level). An empty list is returned when the instance publishes no
        blocklist or a network error occurred.

    Raises:
        Exception: When 'domain' is not registered in the instances table.
        Whatever domain_helper.raise_on() raises for invalid domains.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    if not instances.is_registered(domain):
        raise Exception(f"domain='{domain}' is not registered but function is invoked.")

    blocklist = list()
    block_tag = None

    try:
        logger.debug("Fetching friendica blocks from domain='%s'", domain)
        raw = network.fetch_response(
            domain,
            "/friendica",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("Parsing %d Bytes ...", len(raw))

        doc = bs4.BeautifulSoup(raw, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        # Friendica marks its blocklist section with this well-known id
        block_tag = doc.find(id="about_blocklist")
        logger.debug("block_tag[%s]='%s'", type(block_tag), block_tag)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances from domain='%s'", type(exception), domain)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    logger.debug("block_tag[%s]='%s'", type(block_tag), block_tag)
    if block_tag is None:
        logger.debug("Instance has no block list: domain='%s' - EXIT!", domain)
        return list()

    table = block_tag.find("table")

    logger.debug("table[]='%s'", type(table))
    if table is None:
        logger.warning("domain='%s' has no table tag - EXIT!", domain)
        return list()
    elif table.find("tbody"):
        rows = table.find("tbody").find_all("tr")
    else:
        # No <tbody>: iterate all rows directly; this may include a header
        # row, which is filtered out below by the cell-count check.
        rows = table.find_all("tr")

    logger.debug("Found rows()=%d", len(rows))
    for line in rows:
        logger.debug("line='%s'", line)

        # Fetch the cells once. Header rows (using <th>) or malformed rows
        # don't have the two <td> cells we need - skip them instead of
        # crashing with an IndexError on cells[0]/cells[1].
        cells = line.find_all("td")
        if len(cells) < 2:
            logger.debug("line[]='%s' has only %d cell(s), expected 2 - SKIPPED!", type(line), len(cells))
            continue

        blocked = cells[0].text
        logger.debug("blocked='%s'", blocked)

        # Normalize the domain; keep None for empty cells so the guard
        # below logs and skips them.
        blocked = tidyup.domain(blocked) if blocked != "" else None
        reason  = tidyup.reason(cells[1].text)
        logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

        if blocked is None or blocked == "":
            logger.warning("line[]='%s' returned empty blocked domain - SKIPPED!", type(line))
            continue
        elif not domain_helper.is_wanted(blocked):
            logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
            continue

        logger.debug("Appending blocked='%s',reason='%s'", blocked, reason)
        blocklist.append({
            "blocker"    : domain,
            "blocked"    : blocked,
            "reason"     : reason,
            "block_level": "reject",
        })

    logger.debug("blocklist()=%d - EXIT!", len(blocklist))
    return blocklist