From ba934ecd7350e88e8119b7ad336e246a50dddb4e Mon Sep 17 00:00:00 2001
From: =?utf8?q?Roland=20H=C3=A4der?=
Date: Wed, 10 Jan 2024 22:52:50 +0100
Subject: [PATCH] Continued:

- moved utils.fetch_url() to module network as this is network-related
- added some debug lines
---
 fba/commands.py           | 20 +++++++++---------
 fba/helpers/processing.py |  2 +-
 fba/http/network.py       | 43 ++++++++++++++++++++++++++++++++++++++-
 fba/utils.py              |  4 ++++
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/fba/commands.py b/fba/commands.py
index 4772276..3f10f71 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -485,7 +485,7 @@ def fetch_observer(args: argparse.Namespace) -> int:
     types = list()
     if args.software is None:
         logger.info("Fetching software list ...")
-        raw = utils.fetch_url(
+        raw = network.fetch_url(
             f"https://{source_domain}",
             network.web_headers,
             (config.get("connection_timeout"), config.get("read_timeout"))
@@ -609,7 +609,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
     }

     logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
-    raw = utils.fetch_url(
+    raw = network.fetch_url(
         f"https://{source_domain}/todon/domainblocks",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
@@ -724,7 +724,7 @@ def fetch_cs(args: argparse.Namespace):
         sources.update(source_domain)

     logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
-    raw = utils.fetch_url(
+    raw = network.fetch_url(
         f"https://{source_domain}/chaossocial/meta/master/federation.md",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
@@ -811,7 +811,7 @@ def fetch_fba_rss(args: argparse.Namespace) -> int:
         sources.update(domain)

     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
-    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code == 200 and len(response.text) > 0:
@@ -890,7 +890,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int:
     domains = list()

     logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
-    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code == 200 and len(response.text) > 0:
@@ -1088,7 +1088,7 @@ def fetch_txt(args: argparse.Namespace) -> int:
     logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
     for row in blocklists.txt_files:
         logger.debug("Fetching row[url]='%s' ...", row["url"])
-        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+        response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
         if response.ok and response.status_code == 200 and response.text != "":
@@ -1133,7 +1133,7 @@ def fetch_fedipact(args: argparse.Namespace) -> int:
         sources.update(source_domain)

     logger.info("Fetching / from source_domain='%s' ...", source_domain)
-    response = utils.fetch_url(
+    response = network.fetch_url(
         f"https://{source_domain}",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
@@ -1192,7 +1192,7 @@ def fetch_joinmobilizon(args: argparse.Namespace) -> int:
         sources.update(source_domain)

     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
-    raw = utils.fetch_url(
+    raw = network.fetch_url(
         f"https://{source_domain}/api/v1/instances",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
@@ -1240,7 +1240,7 @@ def fetch_joinmisskey(args: argparse.Namespace) -> int:
         sources.update(source_domain)

     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
-    raw = utils.fetch_url(
+    raw = network.fetch_url(
         f"https://{source_domain}/instances.json",
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
@@ -1764,7 +1764,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
             continue
         else:
             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
-            raw = utils.fetch_url(
+            raw = network.fetch_url(
                 f"https://{row['domain']}",
                 network.web_headers,
                 (config.get("connection_timeout"), config.get("read_timeout"))
diff --git a/fba/helpers/processing.py b/fba/helpers/processing.py
index 07e9371..5e0b832 100644
--- a/fba/helpers/processing.py
+++ b/fba/helpers/processing.py
@@ -138,7 +138,7 @@ def csv_block(blocker: str, url: str, command: str):

     # Fetch this URL
     logger.info("Fetching url='%s' for blocker='%s' ...", url, blocker)
-    response = utils.fetch_url(
+    response = network.fetch_url(
         url,
         network.web_headers,
         (config.get("connection_timeout"), config.get("read_timeout"))
diff --git a/fba/http/network.py b/fba/http/network.py
index 5158c94..43f1ae4 100644
--- a/fba/http/network.py
+++ b/fba/http/network.py
@@ -17,6 +17,8 @@
 import logging
 import time

+from urllib.parse import urlparse
+
 import reqto
 import requests
 import urllib3
@@ -137,7 +139,7 @@ def fetch_api_url(url: str, timeout: tuple) -> dict:

     try:
         logger.debug("Fetching url='%s' ...", url)
-        response = utils.fetch_url(url, api_headers, timeout)
+        response = fetch_url(url, api_headers, timeout)

         logger.debug("response.ok='%s',response.status_code=%d,response.reason='%s'", response.ok, response.status_code, response.reason)
         if response.ok and response.status_code == 200:
@@ -305,3 +307,42 @@ def fetch_response(domain: str, path: str, headers: dict, timeout: tuple, allow_

     logger.debug("response[]='%s' - EXIT!", type(response))
     return response
+
+def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Response:
+    logger.debug("url='%s',headers()=%d,timeout(%d)='%s' - CALLED!", url, len(headers), len(timeout), timeout)
+
+    if not isinstance(url, str):
+        raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'")
+    elif url == "":
+        raise ValueError("Parameter 'url' is empty")
+    elif not validators.url(url):
+        raise ValueError(f"Parameter url='{url}' is not a valid URL")
+    elif not isinstance(headers, dict):
+        raise ValueError(f"Parameter headers[]='{type(headers)}' is not of type 'dict'")
+    elif not isinstance(timeout, tuple):
+        raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not of type 'tuple'")
+
+    logger.debug("Parsing url='%s' ...", url)
+    components = urlparse(url)
+
+    # Invoke other function, avoid trailing ?
+    logger.debug("components[%s]='%s'", type(components), components)
+    if components.query != "":
+        logger.debug("Fetching path='%s?%s' from netloc='%s' ...", components.path, components.query, components.netloc)
+        response = fetch_response(
+            components.netloc.split(":")[0],
+            f"{components.path}?{components.query}",
+            headers,
+            timeout
+        )
+    else:
+        logger.debug("Fetching path='%s' from netloc='%s' ...", components.path, components.netloc)
+        response = fetch_response(
+            components.netloc.split(":")[0],
+            components.path if isinstance(components.path, str) and components.path != '' else '/',
+            headers,
+            timeout
+        )
+
+    logger.debug("response[]='%s' - EXIT!", type(response))
+    return response
diff --git a/fba/utils.py b/fba/utils.py
index 7829723..7b87b13 100644
--- a/fba/utils.py
+++ b/fba/utils.py
@@ -66,6 +66,7 @@ def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Respon
     # Invoke other function, avoid trailing ?
     logger.debug("components[%s]='%s'", type(components), components)
     if components.query != "":
+        logger.debug("Fetching path='%s?%s' from netloc='%s' ...", components.path, components.query, components.netloc)
         response = network.fetch_response(
             components.netloc.split(":")[0],
             f"{components.path}?{components.query}",
@@ -73,6 +74,7 @@ def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Respon
             timeout
         )
     else:
+        logger.debug("Fetching path='%s' from netloc='%s' ...", components.path, components.netloc)
         response = network.fetch_response(
             components.netloc.split(":")[0],
             components.path if isinstance(components.path, str) and components.path != '' else '/',
@@ -105,6 +107,7 @@ def find_domains(tags: bs4.element.ResultSet, search: str) -> list:
         domain = tidyup.domain(tag.find("em").contents[0])

         logger.debug("domain='%s' - AFTER!", domain)
+        logger.debug("domain='%s' - AFTER2!", domain)
         if domain == "":
             logger.warning("Empty domain after checking search='%s' and tags - SKIPPED!", search)
             continue
@@ -135,6 +138,7 @@ def deobfuscate(domain: str, blocker: str, domain_hash: str = None) -> str:
         logger.debug("Setting has_obfuscation=False for blocker='%s' ...", blocker)
         instances.set_has_obfuscation(blocker, False)

+    logger.debug("Checking domain='%s' ...", domain)
     if domain.find("*") >= 0:
         logger.debug("blocker='%s' uses obfuscated domains", blocker)
         instances.set_has_obfuscation(blocker, True)
-- 
2.39.5
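
For orientation, a minimal usage sketch (not part of the patch) of the relocated helper, assuming the import paths used throughout this diff (`fba.http.network`, `fba.helpers.config`) and a placeholder URL:

```python
# Hypothetical usage sketch: fetch_url() is now provided by fba.http.network
# rather than fba.utils. The URL below is a placeholder, not a real endpoint.
from fba.helpers import config
from fba.http import network

response = network.fetch_url(
    "https://example.org/instances.json",  # placeholder
    network.web_headers,
    (config.get("connection_timeout"), config.get("read_timeout"))
)

# Same response-handling pattern as in the patched commands:
if response.ok and response.status_code == 200 and response.text != "":
    print(len(response.text))
```

Moving the wrapper into `network` also lets `fetch_api_url()` call it as a plain `fetch_url(...)` instead of `utils.fetch_url(...)`, removing that cross-module dependency, as the second network.py hunk shows.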