diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py
index a26fe15bccfb2ce1486328200e14bb46ec5574ca..ae45e7528fa844b1c70b0b8f317b21a2dc5e36f2 100644
--- a/fba/networks/lemmy.py
+++ b/fba/networks/lemmy.py
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-import inspect
+import json
 import logging
 
 import bs4
-import validators
-
-from fba import csrf
-from fba import database
-from fba import utils
 
 from fba.helpers import config
+from fba.helpers import domain as domain_helper
 from fba.helpers import tidyup
 
+from fba.http import csrf
 from fba.http import federation
 from fba.http import network
 
-from fba.models import blocks
 from fba.models import instances
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+#logger.setLevel(logging.DEBUG)
 
-def fetch_peers(domain: str) -> list:
-    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
-    if not isinstance(domain, str):
-        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
-    elif domain == "":
-        raise ValueError("Parameter 'domain' is empty")
-    elif domain.lower() != domain:
-        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
-    elif not validators.domain(domain.split("/")[0]):
-        raise ValueError(f"domain='{domain}' is not a valid domain")
-    elif domain.endswith(".arpa"):
-        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif domain.endswith(".tld"):
-        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
+def fetch_peers(domain: str, origin: str) -> list:
+    logger.debug("domain='%s',origin='%s' - CALLED!", domain, origin)
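+    # Validates the domain and rejects reverse-IP (.arpa) and fake (.tld) domains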
+    domain_helper.raise_on(domain)
 
     peers = list()
 
@@ -60,12 +46,14 @@ def fetch_peers(domain: str) -> list:
         logger.debug("Checking CSRF for domain='%s'", domain)
         headers = csrf.determine(domain, dict())
     except network.exceptions as exception:
-        logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
+        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
         instances.set_last_error(domain, exception)
-        return peers
+
+        logger.debug("Returning empty list ... - EXIT!")
+        return list()
 
     try:
-        logger.debug(f"domain='{domain}' is Lemmy, fetching JSON ...")
+        logger.debug("Fetching '/api/v3/site' from domain='%s' ...", domain)
         data = network.get_json_api(
             domain,
             "/api/v3/site",
@@ -75,87 +63,73 @@ def fetch_peers(domain: str) -> list:
 
         logger.debug("data[]='%s'", type(data))
         if "error_message" in data:
-            logger.warning("Could not reach any JSON API:", domain)
+            logger.warning("Could not reach any JSON API: domain='%s'", domain)
             instances.set_last_error(domain, data)
         elif "federated_instances" in data["json"] and isinstance(data["json"]["federated_instances"], dict):
-            logger.debug(f"Found federated_instances for domain='{domain}'")
+            logger.debug("Found federated_instances for domain='%s'", domain)
             peers = peers + federation.add_peers(data["json"]["federated_instances"])
-            logger.debug("Added instance(s) to peers")
-        else:
-            logger.warning("JSON response does not contain 'federated_instances', domain='%s'", domain)
-            instances.set_last_error(domain, data)
+
+            logger.debug("Marking domain='%s' as successfully handled ...", domain)
+            instances.set_success(domain)
+
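+        # Fall back to scraping the HTML /instances page when the API yielded no peers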
+        if len(peers) == 0:
+            logger.warning("Fetching instances for domain='%s' from /instances ...", domain)
+            peers = fetch_instances(domain, origin)
 
     except network.exceptions as exception:
-        logger.warning(f"Exception during fetching JSON: domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
+        logger.warning("Exception during fetching JSON: domain='%s',exception[%s]:'%s'", domain, type(exception), str(exception))
         instances.set_last_error(domain, exception)
 
-    logger.debug(f"Adding '{len(peers)}' for domain='{domain}'")
-    instances.set_total_peers(domain, peers)
-
-    logger.debug("Returning peers[]:", type(peers))
+    logger.debug("peers()=%d - EXIT!", len(peers))
     return peers
 
-def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
-    logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
-    if not isinstance(domain, str):
-        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
-    elif domain == "":
-        raise ValueError("Parameter 'domain' is empty")
-    elif domain.lower() != domain:
-        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
-    elif not validators.domain(domain.split("/")[0]):
-        raise ValueError(f"domain='{domain}' is not a valid domain")
-    elif domain.endswith(".arpa"):
-        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif domain.endswith(".tld"):
-        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
-    elif not isinstance(origin, str) and origin is not None:
-        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
-    elif origin == "":
-        raise ValueError("Parameter 'origin' is empty")
-    elif not isinstance(nodeinfo_url, str):
-        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
+def fetch_blocks(domain: str, nodeinfo_url: str) -> list:
+    logger.debug("domain='%s,nodeinfo_url='%s' - CALLED!", domain, nodeinfo_url)
+    domain_helper.raise_on(domain)
+
+    if not isinstance(nodeinfo_url, str):
+        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not of type 'str'")
     elif nodeinfo_url == "":
         raise ValueError("Parameter 'nodeinfo_url' is empty")
 
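+    # Localized "Blocked Instances" headings, lowercased for case-insensitive matching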
     translations = [
-        "Blocked Instances",
-        "Instàncies bloquejades",
-        "Blocáilte Ásc",
-        "封锁实例",
-        "Blokované instance",
-        "Geblokkeerde instanties",
-        "Blockerade instanser",
-        "Instàncias blocadas",
-        "Istanze bloccate",
-        "Instances bloquées",
-        "Letiltott példányok",
-        "Instancias bloqueadas",
-        "Blokeatuta dauden instantziak",
-        "차단된 인스턴스",
-        "Peladen Yang Diblokir",
-        "Blokerede servere",
-        "Blokitaj nodoj",
-        "Блокирани Инстанции",
-        "Blockierte Instanzen",
-        "Estetyt instanssit",
-        "Instâncias bloqueadas",
-        "Zablokowane instancje",
-        "Blokované inštancie",
-        "المثلاء المحجوبون",
-        "Užblokuoti serveriai",
-        "ブロックしたインスタンス",
-        "Блокированные Инстансы",
-        "Αποκλεισμένοι διακομιστές",
-        "封鎖站台",
-        "Instâncias bloqueadas",
+        "Blocked Instances".lower(),
+        "Instàncies bloquejades".lower(),
+        "Blocáilte Ásc".lower(),
+        "封锁实例".lower(),
+        "Blokované instance".lower(),
+        "Geblokkeerde instanties".lower(),
+        "Blockerade instanser".lower(),
+        "Instàncias blocadas".lower(),
+        "Istanze bloccate".lower(),
+        "Instances bloquées".lower(),
+        "Letiltott példányok".lower(),
+        "Instancias bloqueadas".lower(),
+        "Blokeatuta dauden instantziak".lower(),
+        "차단된 인스턴스".lower(),
+        "Peladen Yang Diblokir".lower(),
+        "Blokerede servere".lower(),
+        "Blokitaj nodoj".lower(),
+        "Блокирани Инстанции".lower(),
+        "Blockierte Instanzen".lower(),
+        "Estetyt instanssit".lower(),
+        "Instâncias bloqueadas".lower(),
+        "Zablokowane instancje".lower(),
+        "Blokované inštancie".lower(),
+        "المثلاء المحجوبون".lower(),
+        "Užblokuoti serveriai".lower(),
+        "ブロックしたインスタンス".lower(),
+        "Блокированные Инстансы".lower(),
+        "Αποκλεισμένοι διακομιστές".lower(),
+        "封鎖站台".lower(),
+        "Instâncias bloqueadas".lower(),
     ]
 
+    blocklist = list()
+
     try:
         # The HTML /instances page lists this instance's linked and blocked peers
-        found_blocks = list()
-
-        logger.debug(f"Fetching /instances from domain='{domain}'")
+        logger.debug("Fetching /instances from domain='%s'", domain)
         response = network.fetch_response(
             domain,
             "/instances",
@@ -164,60 +138,239 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
         )
 
         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
-        if response.ok and response.status_code < 300 and response.text != "":
-            logger.debug(f"Parsing {len(response.text)} Bytes ...")
+        if response.ok and response.status_code == 200 and response.text != "":
+            logger.debug("Parsing %s Bytes ...", len(response.text))
 
             doc = bs4.BeautifulSoup(response.text, "html.parser")
-            logger.debug(f"doc[]={type(doc)}")
+            logger.debug("doc[]='%s'", type(doc))
 
-            headers = doc.findAll("h5")
             found = None
-            logger.debug(f"Search in {len(headers)} header(s) ...")
-            for header in headers:
-                logger.debug(f"header[]={type(header)}")
-                content = header.contents[0]
-
-                logger.debug(f"content='{content}'")
-                if content in translations:
-                    logger.debug("Found header with blocked instances - BREAK!")
-                    found = header
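+            # The instance lists sit in differently-classed containers depending on the Lemmy version/theme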
+            for criteria in [{"class": "home-instances container-lg"}, {"class": "container"}]:
+                logger.debug("criteria='%s'", criteria)
+                containers = doc.findAll("div", criteria)
+
+                logger.debug("Checking %d containers ...", len(containers))
+                for container in containers:
+                    logger.debug("container[]='%s'", type(container))
+                    for header in container.find_all(["h2", "h3", "h4", "h5"]):
+                        logger.debug("header[%s]='%s' - BEFORE!", type(header), header)
+                        # find_all() never yields None, so guard against empty headings instead
+                        content = str(header.contents[0]) if len(header.contents) > 0 else None
+                        logger.debug("content[%s]='%s' - AFTER!", type(content), content)
+
+                        if content is None:
+                            logger.debug("domain='%s' has returned empty header='%s' - SKIPPED!", domain, header)
+                            continue
+                        elif not isinstance(content, str):
+                            logger.debug("content[]='%s' is not supported/wanted type 'str' - SKIPPED!", type(content))
+                            continue
+                        elif content.lower() in translations:
+                            logger.debug("Found header='%s' with blocked instances - BREAK(3) !", header)
+                            found = header
+                            break
+
+                    logger.debug("found[]='%s'", type(found))
+                    if found is not None:
+                        logger.debug("Found header with blocked instances - BREAK(2) !")
+                        break
+
+                logger.debug("found[]='%s'", type(found))
+                if found is not None:
+                    logger.debug("Found header with blocked instances - BREAK(1) !")
                     break
 
-            logger.debug(f"found[]='{type(found)}'")
+            logger.debug("found[]='%s'", type(found))
             if found is None:
-                logger.debug(f"domain='{domain}' is not blocking any instances - EXIT!")
-                return
+                logger.info("domain='%s' has no HTML blocklist, checking scripts ...", domain)
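+                # No matching heading found: recover the blocklist from the embedded window.isoData JSON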
+                peers = parse_script(doc, "blocked")
+
+                logger.debug("domain='%s' has %d peer(s).", domain, len(peers))
+                for blocked in peers:
+                    logger.debug("Appending blocker='%s',blocked='%s',block_level='reject' ...", domain, blocked)
+                    blocklist.append({
+                        "blocker"    : domain,
+                        "blocked"    : blocked,
+                        "reason"     : None,
+                        "block_level": "reject",
+                    })
+
+                logger.debug("blocklist()=%d - EXIT!", len(blocklist))
+                return blocklist
 
-            blocking = found.find_next("ul").findAll("a")
-            logger.debug(f"Found {len(blocking)} blocked instance(s) ...")
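+            # The blocked instances follow the matched heading as a <ul> or <table>, one <a> per domain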
+            blocking = found.find_next(["ul", "table"]).findAll("a")
+            logger.debug("Found %d blocked instance(s) ...", len(blocking))
             for tag in blocking:
-                logger.debug(f"tag[]='{type(tag)}'")
+                logger.debug("tag[]='%s'", type(tag))
                 blocked = tidyup.domain(tag.contents[0])
+                logger.debug("blocked='%s'", blocked)
 
-                logger.debug(f"blocked='{blocked}'")
-                if not utils.is_domain_wanted(blocked):
-                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
+                if blocked == "":
+                    logger.warning("blocked='%s' is empty after tidyup.domain() - SKIPPED!", tag.contents[0])
                     continue
-                elif not instances.is_registered(blocked):
-                    logger.debug("Hash wasn't found, adding:", blocked, domain)
-                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
 
-                if not blocks.is_instance_blocked(domain, blocked, "reject"):
-                    logger.debug("Blocking:", domain, blocked)
-                    blocks.add_instance(domain, blocked, None, "reject")
+                logger.debug("Appending blocker='%s',blocked='%s',block_level='reject' ...", domain, blocked)
+                blocklist.append({
+                    "blocker"    : domain,
+                    "blocked"    : blocked,
+                    "reason"     : None,
+                    "block_level": "reject",
+                })
+        else:
+            logger.warning("Cannot fetch /instances due to error: response.ok='%s',response.status_code=%d,response.details='%s'", response.ok, response.status_code, response.reason)
+            instances.set_last_error(domain, response)
 
-                    found_blocks.append({
-                        "blocked": blocked,
-                        "reason" : None
-                    })
-                else:
-                    logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
-                    blocks.update_last_seen(domain, blocked, "reject")
+    except network.exceptions as exception:
+        logger.warning("domain='%s',exception[%s]:'%s'", domain, type(exception), str(exception))
+        instances.set_last_error(domain, exception)
+
+    logger.debug("blocklist()=%d - EXIT!", len(blocklist))
+    return blocklist
+
+def fetch_instances(domain: str, origin: str) -> list:
+    logger.debug("domain='%s',origin='%s' - CALLED!", domain, origin)
+    domain_helper.raise_on(domain)
+
+    peers = list()
+
+    try:
+        # The HTML /instances page lists this instance's federated peers
+        logger.debug("Fetching /instances from domain='%s'", domain)
+        response = network.fetch_response(
+            domain,
+            "/instances",
+            network.web_headers,
+            (config.get("connection_timeout"), config.get("read_timeout"))
+        )
+
+        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
+        if response.ok and response.status_code == 200 and response.text != "":
+            logger.debug("Parsing %s Bytes ...", len(response.text))
+
+            doc = bs4.BeautifulSoup(response.text, "html.parser")
+            logger.debug("doc[]='%s'", type(doc))
+
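+            # Same container classes as in fetch_blocks(); they vary with the Lemmy version/theme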
+            for criteria in [{"class": "home-instances container-lg"}, {"class": "container"}]:
+                logger.debug("criteria='%s'", criteria)
+                containers = doc.findAll("div", criteria)
+
+                logger.debug("Checking %d containers ...", len(containers))
+                for container in containers:
+                    logger.debug("container[%s]='%s'", type(container), container)
+
+                    rows = container.find_next(["ul", "table"]).findAll("a")
+                    logger.debug("Found %d instance(s) ...", len(rows))
+                    for tag in rows:
+                        logger.debug("tag[]='%s'", type(tag))
+                        text = tag.contents[0] if isinstance(tag.contents[0], str) else tag.contents[0].text
+                        peer = tidyup.domain(text)
+                        logger.debug("peer='%s'", peer)
+
+                        if peer == "":
+                            logger.debug("peer is empty - SKIPPED!")
+                            continue
+                        elif peer in peers:
+                            logger.debug("peer='%s' already added - SKIPPED!", peer)
+                            continue
+
+                        logger.debug("Appending peer='%s' ...", peer)
+                        peers.append(peer)
+
+            logger.debug("peers()=%d", len(peers))
+            if len(peers) == 0:
+                logger.debug("Found no peers for domain='%s', trying script tag ...", domain)
+                peers = parse_script(doc)
+        else:
+            logger.warning("Cannot fetch /instances due to error: response.ok='%s',response.status_code=%d,response.details='%s'", response.ok, response.status_code, response.reason)
+            instances.set_last_error(domain, response)
+
+        logger.debug("Marking domain='%s' as successfully handled, peers()=%d ...", domain, len(peers))
+        instances.set_success(domain)
 
-        logger.debug("Invoking commit() ...")
-        database.connection.commit()
     except network.exceptions as exception:
-        logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
+        logger.warning("domain='%s',exception[%s]:'%s'", domain, type(exception), str(exception))
         instances.set_last_error(domain, exception)
 
-    logger.debug("EXIT!")
+    logger.debug("peers()=%d - EXIT!", len(peers))
+    return peers
+
+def parse_script(doc: bs4.BeautifulSoup, only: str = None) -> list:
+    logger.debug("doc[]='%s',only='%s' - CALLED!")
+    if not isinstance(doc, bs4.BeautifulSoup):
+        raise ValueError(f"Parameter doc[]='{type(only)}' is not of type 'bs4.BeautifulSoup'")
+    elif not isinstance(only, str) and only is not None:
+        raise ValueError(f"Parameter only[]='{type(only)}' is not of type 'str'")
+    elif isinstance(only, str) and only == "":
+        raise ValueError("Parameter 'only' is empty")
+
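+    # Lemmy server-side renders its initial state into a script tag: "window.isoData = {...}"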
+    scripts = doc.find_all("script")
+    peers = list()
+
+    logger.debug("scripts()=%d", len(scripts))
+    for script in scripts:
+        logger.debug("script[%s].contents()=%d", type(script), len(script.contents))
+        if len(script.contents) == 0:
+            logger.debug("script has no contents - SKIPPED!")
+            continue
+        elif not script.contents[0].startswith("window.isoData"):
+            logger.debug("script.contents[0]='%s' does not start with window.isoData - SKIPPED!", script.contents[0])
+            continue
+
+        logger.debug("script.contents[0][]='%s'", type(script.contents[0]))
+
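+        # Everything after the first "=" is (almost) JSON; bare "undefined"
+        # values are quoted so json.loads() accepts the payload.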
+        iso_data = script.contents[0].split("=", 1)[1].strip().replace(":undefined", ":\"undefined\"")
+        logger.debug("iso_data[%s]='%s'", type(iso_data), iso_data)
+
+        parsed = None
+        try:
+            parsed = json.loads(iso_data)
+        except json.decoder.JSONDecodeError as exception:
+            logger.warning("Exception '%s' during parsing %d Bytes: '%s' - EXIT!", type(exception), len(iso_data), str(exception))
+            return list()
+
+        logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
+
+        if "routeData" not in parsed:
+            logger.warning("parsed[%s]()=%d does not contain element 'routeData'", type(parsed), len(parsed))
+            continue
+        elif "federatedInstancesResponse" not in parsed["routeData"]:
+            logger.warning("parsed[routeData][%s]()=%d does not contain element 'federatedInstancesResponse'", type(parsed["routeData"]), len(parsed["routeData"]))
+            continue
+        elif "data" not in parsed["routeData"]["federatedInstancesResponse"]:
+            logger.warning("parsed[routeData][federatedInstancesResponse][%s]()=%d does not contain element 'data'", type(parsed["routeData"]["federatedInstancesResponse"]), len(parsed["routeData"]["federatedInstancesResponse"]))
+            continue
+        elif "federated_instances" not in parsed["routeData"]["federatedInstancesResponse"]["data"]:
+            logger.warning("parsed[routeData][federatedInstancesResponse][data][%s]()=%d does not contain element 'data'", type(parsed["routeData"]["federatedInstancesResponse"]["data"]), len(parsed["routeData"]["federatedInstancesResponse"]["data"]))
+            continue
+
+        data = parsed["routeData"]["federatedInstancesResponse"]["data"]["federated_instances"]
+        logger.debug("Checking %d data elements ...", len(data))
+        for element in data:
+            logger.debug("element='%s'", element)
+            if isinstance(only, str) and only != element:
+                logger.debug("Skipping unwanted element='%s',only='%s'", element, only)
+                continue
+
+            logger.debug("Checking data[%s]()=%d row(s) ...", element, len(data[element]))
+            for row in data[element]:
+                logger.debug("row[]='%s'", type(row))
+                if "domain" not in row:
+                    logger.warning("row()=%d has no element 'domain' - SKIPPED!", len(row))
+                    continue
+
+                logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
+                peer = tidyup.domain(row["domain"])
+                logger.debug("peer='%s' - AFTER!", peer)
+
+                if peer == "":
+                    logger.debug("peer is empty - SKIPPED!")
+                    continue
+                elif peer in peers:
+                    logger.debug("peer='%s' already added - SKIPPED!", peer)
+                    continue
+
+                logger.debug("Appending peer='%s' ...", peer)
+                peers.append(peer)
+
+    logger.debug("peers()=%d - EXIT!", len(peers))
+    return peers