git.mxchange.org Git - fba.git/commitdiff
Continued:
author    Roland Häder <roland@mxchange.org>
          Mon, 3 Jul 2023 03:09:49 +0000 (05:09 +0200)
committer Roland Häder <roland@mxchange.org>
          Mon, 3 Jul 2023 03:09:49 +0000 (05:09 +0200)
- some instances have broken /.well-known/nodeinfo links (href): some contain a
  scheme but no netloc (host name); see the sketch below
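
A minimal, standalone sketch (the domain and href values are made up, not taken from
the codebase) of how urllib.parse.urlparse sees both kinds of broken href and how the
patch below rebuilds a usable URL from the domain the nodeinfo document was fetched
from:

    from urllib.parse import urlparse

    domain = "example.social"  # assumed: the domain the nodeinfo document came from

    # Two broken cases: no scheme and no host name, or a scheme but no host name.
    for href in ("/nodeinfo/2.0", "https:///nodeinfo/2.0"):
        components = urlparse(href)

        if components.scheme == "" and components.netloc == "":
            # Neither scheme nor host name: prepend both from the known domain.
            url = f"https://{domain}{href}"
        elif components.netloc == "":
            # Scheme is present but the host name is missing: keep the scheme.
            url = f"{components.scheme}://{domain}{components.path}"
        else:
            url = href

        # Both cases end up as https://example.social/nodeinfo/2.0
        print(f"{href} -> {url}")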

config.defaults.json
fba/commands.py
fba/csrf.py
fba/http/federation.py
fba/http/network.py

config.defaults.json
index d6cb2205ea664121b561791448f2f8e956239f43..e378f47b9b27dbfb18a691d58650ee273218506d 100644 (file)
@@ -3,7 +3,7 @@
     "log_level"         : "info",
     "host"              : "127.0.0.1",
     "port"              : 8069,
-    "useragent"         : "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/113.0",
+    "useragent"         : "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0",
     "connection_timeout": 30,
     "read_timeout"      : 5,
     "hostname"          : "fba.ryona.agency",
fba/commands.py
index 38ffd6eae3d72004d52518660c3fa3b20def8553..58953539dff5eeecf03f2abbb017d9189e865d0b 100644 (file)
@@ -1384,7 +1384,7 @@ def update_nodeinfo(args: argparse.Namespace) -> int:
         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
     else:
         logger.info("Fetching domains for recently updated ...")
-        database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_block")])
+        database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL AND software IS NULL AND last_status_code < 999", [time.time() - config.get("recheck_block")])
 
     domains = database.cursor.fetchall()
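
Note on the extended query above: in SQLite, AND binds more tightly than OR, so the
added condition groups as in the equivalent, explicitly parenthesised call below
(same identifiers as in the hunk above, shown only for readability):

    database.cursor.execute(
        "SELECT domain, software FROM instances "
        "WHERE last_nodeinfo < ? "
        "OR (last_nodeinfo IS NULL AND software IS NULL AND last_status_code < 999)",
        [time.time() - config.get("recheck_block")]
    )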
 
fba/csrf.py
index 401b7cd6ba252fe758bb8b923bb5e76b845f059c..d5a3ecc0d6799c456f995e182ea11dc1af00fe89 100644 (file)
@@ -16,6 +16,8 @@
 
 import logging
 
+from urllib.parse import urlparse
+
 import bs4
 import reqto
 
@@ -25,6 +27,8 @@ from fba.helpers import domain as domain_helper
 
 from fba.http import network
 
+from fba.models import instances
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
@@ -43,12 +47,12 @@ def determine(domain: str, headers: dict) -> dict:
     response = reqto.get(
         f"https://{domain}/",
         headers=network.web_headers,
-        timeout=(config.get("connection_timeout"), config.get("read_timeout")),
-        allow_redirects=False
+        timeout=(config.get("connection_timeout"), config.get("read_timeout"))
     )
+    components = urlparse(response.url)
 
     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
-    if response.ok and response.status_code < 300 and response.text != "" and response.text.find("<html") > 0:
+    if response.ok and response.status_code < 300 and response.text.strip() != "" and response.text.find("<html") > 0 and domain == components.netloc:
         # Save cookies
         logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
         cookies.store(domain, response.cookies.get_dict())
@@ -65,6 +69,9 @@ def determine(domain: str, headers: dict) -> dict:
         if tag is not None:
             logger.debug("Adding CSRF token='%s' for domain='%s'", tag["content"], domain)
             reqheaders["X-CSRF-Token"] = tag["content"]
+    elif domain != components.netloc:
+        logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc)
+        instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'")
 
     logger.debug("reqheaders()=%d - EXIT!", len(reqheaders))
     return reqheaders
fba/http/federation.py
index 339b4117241042841c155a26dfa2f54d3b5031d9..a90dc8bc2e43a0a82f385c29396342317179714f 100644 (file)
@@ -340,9 +340,13 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict:
 
                         logger.debug("components[%s]='%s'", type(components), components)
                         if components.scheme == "" and components.netloc == "":
-                            logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
+                            logger.warning("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                             url = f"https://{domain}{url}"
                             components = urlparse(url)
+                        elif components.netloc == "":
+                            logger.warning("link[href]='%s' has no netloc set, setting domain='%s'", link["href"], domain)
+                            url = f"{components.scheme}://{domain}{components.path}"
+                            components = urlparse(url)
 
                         if not utils.is_domain_wanted(components.netloc):
                             logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
@@ -390,10 +394,16 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
     software = None
 
     logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
-    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    response = network.fetch_response(
+        domain, path,
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout")),
+        allow_redirects=True
+    )
+    components = urlparse(response.url)
 
     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
-    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
+    if response.ok and response.status_code < 300 and response.text.find("<html") > 0 and components.netloc == domain:
         logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
         doc = bs4.BeautifulSoup(response.text, "html.parser")
 
@@ -418,6 +428,9 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
             if software is not None and software != "":
                 logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                 instances.set_detection_mode(domain, "SITE_NAME")
+    elif domain != components.netloc:
+        logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc)
+        instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'")
 
     logger.debug("software[]='%s'", type(software))
     if isinstance(software, str) and software == "":
fba/http/network.py
index 00a31e4476494f991e1a6a12de1d67abb2d5e994..58d44af14119cf21ed1a9580a636742a7fd46028 100644 (file)
@@ -89,7 +89,7 @@ def post_json_api(domain: str, path: str, data: str = "", headers: dict = dict()
 
         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
         if not response.ok or response.status_code >= 300 or len(response.text.strip()) == 0:
-            logger.warning("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text))
+            logger.debug("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text))
             json_reply["status_code"]   = response.status_code
             json_reply["error_message"] = response.reason
             instances.set_last_error(domain, response)
@@ -183,7 +183,7 @@ def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict:
 
     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if not response.ok or response.status_code >= 300 or len(response.text) == 0:
-        logger.warning("Cannot query JSON API: domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text))
+        logger.debug("Cannot query JSON API: domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text))
         json_reply["status_code"]   = response.status_code
         json_reply["error_message"] = response.reason
         instances.set_last_error(domain, response)
@@ -238,8 +238,8 @@ def send_bot_post(domain: str, blocklist: list):
 
     return True
 
-def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> requests.models.Response:
-    logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s' - CALLED!", domain, path, len(headers), timeout)
+def fetch_response(domain: str, path: str, headers: dict, timeout: tuple, allow_redirects: bool = False) -> requests.models.Response:
+    logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s',allow_redirects='%s' - CALLED!", domain, path, len(headers), timeout, allow_redirects)
     domain_helper.raise_on(domain)
 
     if not isinstance(path, str):
@@ -258,7 +258,7 @@ def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> req
             headers=headers,
             timeout=timeout,
             cookies=cookies.get_all(domain),
-            allow_redirects=False
+            allow_redirects=allow_redirects
         )
 
     except exceptions as exception: