From 5a1a4a08678afecf12011c6c19b1941a4bed1c26 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Mon, 1 Jul 2024 18:21:54 +0200 Subject: [PATCH] Continued: - WordPress is a peer list provider and can be utilized for more "peers" aka. instances - URL from response object can be different than requested, it needs to be revalidated - invalid redirect URLs are now logged with a level WARNING message --- fba/commands.py | 2 +- fba/http/csrf.py | 16 +++++++++++++--- fba/http/federation.py | 13 ++++++++++--- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fba/commands.py b/fba/commands.py index eed2813..05cdd7e 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -1031,7 +1031,7 @@ def fetch_instances(args: argparse.Namespace) -> int: database.cursor.execute( "SELECT domain, origin, software \ FROM instances \ -WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen', 'vebinet', 'toki', 'snac', 'biblioreads') \ +WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen', 'vebinet', 'toki', 'snac', 'biblioreads', 'wordpress') \ ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC" ) diff --git a/fba/http/csrf.py b/fba/http/csrf.py index 57ed544..6843905 100644 --- a/fba/http/csrf.py +++ b/fba/http/csrf.py @@ -19,6 +19,7 @@ import logging import bs4 import reqto import requests +import validators from fba.helpers import blacklist from fba.helpers import config @@ -54,7 +55,10 @@ def determine(domain: str, headers: dict) -> dict: ) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code == 200 and response.text.strip() 
!= "" and response.text.find("= 0 and domain_helper.is_in_url(domain, response.url.split("#")[0]): + response_url = response.url.split("#")[0] + logger.debug("response_url='%s'", response_url) + + if response.ok and response.status_code == 200 and response.text.strip() != "" and response.text.find("= 0 and validators.url(response_url) and domain_helper.is_in_url(domain, response_url): # Save cookies logger.debug("Parsing response.text()=%d Bytes ...", len(response.text)) cookies.store(domain, response.cookies.get_dict()) @@ -78,10 +82,16 @@ def determine(domain: str, headers: dict) -> dict: if tag is not None: logger.debug("Adding CSRF token='%s' for domain='%s'", tag["content"], domain) reqheaders["X-CSRF-Token"] = tag["content"] - elif not domain_helper.is_in_url(domain, response.url.split("#")[0]): + elif not validators.url(response_url): + logger.warning("response_url='%s' is not valid - Raising exception ...", response_url) + + message = f"Redirect from domain='{domain}' to response_url='{response_url}'" + instances.set_last_error(domain, message) + raise requests.exceptions.TooManyRedirects(message) + elif not domain_helper.is_in_url(domain, response_url): logger.warning("domain='%s' doesn't match with response.url='%s', maybe redirect to other domain?", domain, response.url) - message = f"Redirect from domain='{domain}' to response.url='{response.url}'" + message = f"Redirect from domain='{domain}' to response_url='{response_url}'" instances.set_last_error(domain, message) raise requests.exceptions.TooManyRedirects(message) diff --git a/fba/http/federation.py b/fba/http/federation.py index b3ad86c..eae8f8e 100644 --- a/fba/http/federation.py +++ b/fba/http/federation.py @@ -294,9 +294,10 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: allow_redirects=True ) - response_url = response.url.split("#")[0] + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) + 
response_url = response.url.split("#")[0] + logger.debug("response_url='%s'", response_url) - logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d,response_url='%s'", response.ok, response.status_code, len(response.text), response_url) if ((response.ok and response.status_code == 200) or response.status_code == 410) and response.text.find("<html") >= 0 and validators.url(response_url) and domain_helper.is_in_url(domain, response_url): logger.debug("Parsing response.text()=%d Bytes ...", len(response.text)) doc = bs4.BeautifulSoup(response.text, "html.parser") @@ -340,7 +341,13 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: if software is not None and software != "": logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software) instances.set_detection_mode(domain, "SITE_NAME") - elif validators.url(response_url) and not domain_helper.is_in_url(domain, response_url): + elif not validators.url(response_url): + logger.warning("response_url='%s' is not valid - Raising exception ...", response_url) + + message = f"Redirect from domain='{domain}' to response_url='{response_url}'" + instances.set_last_error(domain, message) + raise requests.exceptions.TooManyRedirects(message) + elif not domain_helper.is_in_url(domain, response_url): logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url) components = urlparse(response.url) -- 2.39.5