From c8eed9b7d8b2ee26887bea4cfca242b2ca2387f2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Mon, 3 Jul 2023 05:09:49 +0200 Subject: [PATCH] Continued: - some people have broken /.well-known/nodeinfo links (href), some contain scheme, but no netloc (host name) --- config.defaults.json | 2 +- fba/commands.py | 2 +- fba/csrf.py | 13 ++++++++++--- fba/http/federation.py | 19 ++++++++++++++++--- fba/http/network.py | 10 +++++----- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/config.defaults.json b/config.defaults.json index d6cb220..e378f47 100644 --- a/config.defaults.json +++ b/config.defaults.json @@ -3,7 +3,7 @@ "log_level" : "info", "host" : "127.0.0.1", "port" : 8069, - "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/113.0", + "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0", "connection_timeout": 30, "read_timeout" : 5, "hostname" : "fba.ryona.agency", diff --git a/fba/commands.py b/fba/commands.py index 38ffd6e..5895353 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -1384,7 +1384,7 @@ def update_nodeinfo(args: argparse.Namespace) -> int: database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software]) else: logger.info("Fetching domains for recently updated ...") - database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_block")]) + database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? 
OR last_nodeinfo IS NULL AND software IS NULL AND last_status_code < 999", [time.time() - config.get("recheck_block")]) domains = database.cursor.fetchall() diff --git a/fba/csrf.py b/fba/csrf.py index 401b7cd..d5a3ecc 100644 --- a/fba/csrf.py +++ b/fba/csrf.py @@ -16,6 +16,8 @@ import logging +from urllib.parse import urlparse + import bs4 import reqto @@ -25,6 +27,8 @@ from fba.helpers import domain as domain_helper from fba.http import network +from fba.models import instances + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -43,12 +47,12 @@ def determine(domain: str, headers: dict) -> dict: response = reqto.get( f"https://{domain}/", headers=network.web_headers, - timeout=(config.get("connection_timeout"), config.get("read_timeout")), - allow_redirects=False + timeout=(config.get("connection_timeout"), config.get("read_timeout")) ) + components = urlparse(response.url) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and response.text != "" and response.text.find("<html") > 0: + if response.ok and response.status_code < 300 and response.text.strip() != "" and response.text.find("<html") > 0 and domain == components.netloc: # Save cookies logger.debug("Parsing response.text()=%d Bytes ...", len(response.text)) cookies.store(domain, response.cookies.get_dict()) @@ -65,6 +69,9 @@ def determine(domain: str, headers: dict) -> dict: if tag is not None: logger.debug("Adding CSRF token='%s' for domain='%s'", tag["content"], domain) reqheaders["X-CSRF-Token"] = tag["content"] + elif domain != components.netloc: + logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc) + instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'") logger.debug("reqheaders()=%d - EXIT!", len(reqheaders)) return reqheaders diff 
--git a/fba/http/federation.py b/fba/http/federation.py index 339b411..a90dc8b 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -340,9 +340,13 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict: logger.debug("components[%s]='%s'", type(components), components) if components.scheme == "" and components.netloc == "": - logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain) + logger.warning("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain) url = f"https://{domain}{url}" components = urlparse(url) + elif components.netloc == "": + logger.warning("link[href]='%s' has no netloc set, setting domain='%s'", link["href"], domain) + url = f"{components.scheme}://{domain}{components.path}" + components = urlparse(url) if not utils.is_domain_wanted(components.netloc): logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc) @@ -390,10 +394,16 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: software = None logger.debug("Fetching path='%s' from domain='%s' ...", path, domain) - response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + response = network.fetch_response( + domain, path, + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")), + allow_redirects=True + ) + components = urlparse(response.url) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and response.text.find("<html") > 0: + if response.ok and response.status_code < 300 and response.text.find("<html") > 0 and components.netloc == domain: logger.debug("Parsing response.text()=%d Bytes ...", len(response.text)) doc = bs4.BeautifulSoup(response.text, "html.parser") @@ -418,6 +428,9 @@ def 
fetch_generator_from_path(domain: str, path: str = "/") -> str: if software is not None and software != "": logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software) instances.set_detection_mode(domain, "SITE_NAME") + elif domain != components.netloc: + logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc) + instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'") logger.debug("software[]='%s'", type(software)) if isinstance(software, str) and software == "": diff --git a/fba/http/network.py b/fba/http/network.py index 00a31e4..58d44af 100644 --- a/fba/http/network.py +++ b/fba/http/network.py @@ -89,7 +89,7 @@ def post_json_api(domain: str, path: str, data: str = "", headers: dict = dict() logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if not response.ok or response.status_code >= 300 or len(response.text.strip()) == 0: - logger.warning("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text)) + logger.debug("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text)) json_reply["status_code"] = response.status_code json_reply["error_message"] = response.reason instances.set_last_error(domain, response) @@ -183,7 +183,7 @@ def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict: logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if not response.ok or response.status_code >= 300 or len(response.text) == 0: - logger.warning("Cannot query JSON API: 
domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text)) + logger.debug("Cannot query JSON API: domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text)) json_reply["status_code"] = response.status_code json_reply["error_message"] = response.reason instances.set_last_error(domain, response) @@ -238,8 +238,8 @@ def send_bot_post(domain: str, blocklist: list): return True -def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> requests.models.Response: - logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s' - CALLED!", domain, path, len(headers), timeout) +def fetch_response(domain: str, path: str, headers: dict, timeout: tuple, allow_redirects: bool = False) -> requests.models.Response: + logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s',allow_redirects='%s' - CALLED!", domain, path, len(headers), timeout, allow_redirects) domain_helper.raise_on(domain) if not isinstance(path, str): @@ -258,7 +258,7 @@ def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> req headers=headers, timeout=timeout, cookies=cookies.get_all(domain), - allow_redirects=False + allow_redirects=allow_redirects ) except exceptions as exception: -- 2.39.2