Continued:

[fba.git] / fba / http / federation.py
diff --git a/fba/http/federation.py b/fba/http/federation.py

index a8291c7e01b22d4d347666545ca9d262282c9ec1..5f6616d0b0777baa6d561d63fece9cedada6e6c0 100644 (file)
--- a/fba/http/federation.py
+++ b/fba/http/federation.py
@@ -21,6 +21,7 @@ import bs4
  import validators
  
  from fba import csrf
+from fba import utils
  
  from fba.helpers import blacklist
  from fba.helpers import config
@@ -123,20 +124,11 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path:
          if instance == "":
              logger.warning(f"Empty instance after tidyup.domain(), domain='{domain}'")
              continue
-        elif not validators.domain(instance.split("/")[0]):
-            logger.warning(f"Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
-            continue
-        elif instance.endswith(".arpa"):
-            logger.warning(f"instance='{instance}' is a reversed .arpa domain and should not be used generally.")
-            continue
-        elif blacklist.is_blacklisted(instance):
-            logger.debug("instance is blacklisted:", instance)
+        elif not utils.is_domain_wanted(instance):
+            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
              continue
          elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
-            logger.debug(f"instance='{instance}' is a link to a single user profile - SKIPPED!")
-            continue
-        elif instance.endswith(".tld"):
-            logger.debug(f"instance='{instance}' is a fake domain - SKIPPED!")
+            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
              continue
          elif not instances.is_registered(instance):
              logger.debug("Adding new instance:", instance, domain)
@@ -178,7 +170,7 @@ def fetch_peers(domain: str, software: str) -> list:
      headers = tuple()
  
      try:
-        logger.debug(f"Checking CSRF for domain='{domain}'")
+        logger.debug("Checking CSRF for domain='%s'", domain)
          headers = csrf.determine(domain, dict())
      except network.exceptions as exception:
          logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
@@ -212,13 +204,13 @@ def fetch_peers(domain: str, software: str) -> list:
              logger.debug("Added instance(s) to peers")
          else:
              message = "JSON response does not contain 'federated_instances' or 'error_message'"
-            logger.warning(f"{message},domain='{domain}'")
+            logger.warning("message='%s',domain='%s'", message, domain)
              instances.set_last_error(domain, message)
      elif isinstance(data["json"], list):
-        # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json']))
+        logger.debug("Querying API was successful: domain='%s',data[json]()=%d", domain, len(data['json']))
          peers = data["json"]
      else:
-        logger.warning(f"Cannot parse data[json][]='{type(data['json'])}'")
+        logger.warning("Cannot parse data[json][]='%s'", type(data['json']))
  
      logger.debug(f"Adding '{len(peers)}' for domain='{domain}'")
      instances.set_total_peers(domain, peers)
@@ -256,7 +248,7 @@ def fetch_nodeinfo(domain: str, path: str = None) -> dict:
      data = dict()
  
      try:
-        logger.debug(f"Checking CSRF for domain='{domain}'")
+        logger.debug("Checking CSRF for domain='%s'", domain)
          headers = csrf.determine(domain, dict())
      except network.exceptions as exception:
          logger.warning(f"Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
@@ -323,7 +315,7 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict:
      headers = tuple()
  
      try:
-        logger.debug(f"Checking CSRF for domain='{domain}'")
+        logger.debug("Checking CSRF for domain='%s'", domain)
          headers = csrf.determine(domain, dict())
      except network.exceptions as exception:
          logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
@@ -362,17 +354,8 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict:
                          url = f"https://{domain}{url}"
                          components = urlparse(url)
  
-                    if not validators.domain(components.netloc):
-                        logger.warning(f"components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
-                        continue
-                    elif domain.endswith(".arpa"):
-                        logger.warning("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                        continue
-                    elif domain.endswith(".tld"):
-                        logger.warning("domain='%s' is a fake domain - SKIPPED!", domain)
-                        continue
-                    elif blacklist.is_blacklisted(components.netloc):
-                        logger.debug(f"components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
+                    if not utils.is_domain_wanted(components.netloc):
+                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                          continue
  
                      logger.debug("Fetching nodeinfo from:", url)
@@ -422,19 +405,21 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
      logger.debug(f"Fetching path='{path}' from '{domain}' ...")
      response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
  
-    logger.debug("domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
      if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
          logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
+
          doc = bs4.BeautifulSoup(response.text, "html.parser")
+        logger.debug("doc[]='%s'", type(doc))
  
-        logger.debug("doc[]:", type(doc))
          generator = doc.find("meta", {"name"    : "generator"})
          site_name = doc.find("meta", {"property": "og:site_name"})
  
-        logger.debug(f"generator='{generator}',site_name='{site_name}'")
+        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
          if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
              logger.debug("Found generator meta tag:", domain)
              software = tidyup.domain(generator.get("content"))
+
              logger.debug("software[%s]='%s'", type(software), software)
              if software is not None and software != "":
                  logger.info("domain='%s' is generated by '%s'", domain, software)
@@ -442,6 +427,7 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
          elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
              logger.debug("Found property=og:site_name:", domain)
              software = tidyup.domain(site_name.get("content"))
+
              logger.debug("software[%s]='%s'", type(software), software)
              if software is not None and software != "":
                  logger.info("domain='%s' has og:site_name='%s'", domain, software)
@@ -449,7 +435,7 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
  
      logger.debug("software[]='%s'", type(software))
      if isinstance(software, str) and software == "":
-        logger.debug(f"Corrected empty string to None for software of domain='{domain}'")
+        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
          software = None
      elif isinstance(software, str) and ("." in software or " " in software):
          logger.debug(f"software='{software}' may contain a version number, domain='{domain}', removing it ...")
@@ -596,16 +582,7 @@ def find_domains(tag: bs4.element.Tag) -> list:
  
          logger.debug("domain='%s',reason='%s'", domain, reason)
  
-        if not validators.domain(domain.split("/")[0]):
-            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".arpa"):
-            logger.warning("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".tld"):
-            logger.warning("domain='%s' is a fake domain - SKIPPED!", domain)
-            continue
-        elif blacklist.is_blacklisted(domain):
+        if not utils.is_domain_wanted(domain):
              logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
              continue
          elif domain == "gab.com/.ai, develop.gab.com":
@@ -661,17 +638,8 @@ def add_peers(rows: dict) -> list:
                  raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
  
              logger.debug(f"peer='{peer}' - AFTER!")
-            if not validators.domain(peer):
-                logger.warning(f"peer='{peer}' is not a valid domain - SKIPPED!")
-                continue
-            elif peer.endswith(".arpa"):
-                logger.warning(f"peer='{peer}' is a domain for reversed IP addresses -SKIPPED!")
-                continue
-            elif peer.endswith(".tld"):
-                logger.warning(f"peer='{peer}' is a fake domain - SKIPPED!")
-                continue
-            elif blacklist.is_blacklisted(peer):
-                logger.debug(f"peer='{peer}' is blacklisted - SKIPPED!")
+            if not utils.is_domain_wanted(peer):
+                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                  continue
  
              logger.debug(f"Adding peer='{peer}' ...")