From b038b3ec9a685f2a4b7fbf49708687836d35834f Mon Sep 17 00:00:00 2001
From: =?utf8?q?Roland=20H=C3=A4der?=
Date: Tue, 7 Jan 2025 04:31:37 +0100
Subject: [PATCH] Continued:

- local "caching" of configuration values to speed up code execution and
  reduce massive debug logging
- combined None and "" checks into a single `not in [None, ""]` test

---
 fba/commands.py           | 82 ++++++++++++++++++++-------------
 fba/helpers/blacklist.py  |  2 +
 fba/helpers/processing.py |  6 ++-
 fba/http/csrf.py          |  1 +
 fba/http/federation.py    |  7 +++-
 fba/http/network.py       |  7 ++--
 fba/http/nodeinfo.py      | 12 ++++--
 7 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/fba/commands.py b/fba/commands.py
index 5dfdced..f8dcd74 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -59,6 +59,10 @@ from fba.networks import mastodon
 from fba.networks import misskey
 from fba.networks import pleroma
 
+# Locally "cached" values to speedup code and keep massive debug log shorter
+_timeout = (config.get("connection_timeout"), config.get("read_timeout"))
+_bot_enabled = config.get("bot_enabled")
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 #logger.setLevel(logging.DEBUG)
@@ -266,7 +270,7 @@ def fetch_bkali(args: argparse.Namespace) -> int:
 def fetch_blocks(args: argparse.Namespace) -> int:
     logger.debug("args[]='%s' - CALLED!", type(args))
 
-    if args.domain is not None and args.domain != "":
+    if args.domain not in [None, ""]:
         logger.debug("args.domain='%s' - checking ...", args.domain)
         if not validators.domain(args.domain, rfc_2782=True):
             logger.warning("args.domain='%s' is not valid.", args.domain)
@@ -281,11 +285,11 @@ def fetch_blocks(args: argparse.Namespace) -> int:
     logger.debug("Invoking locking.acquire() ...")
     locking.acquire()
 
-    if args.domain is not None and args.domain != "":
+    if args.domain not in [None, ""]:
         # Re-check single domain
         logger.debug("Querying database for args.domain='%s' ...", args.domain)
         database.cursor.execute("SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain])
-    elif args.software is not None and args.software != "":
+    elif args.software not in [None, ""]:
         # Re-check single software
         logger.debug("Querying database for args.software='%s' ...", args.software)
         database.cursor.execute("SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? ORDER BY last_blocked ASC, total_blocks DESC", [args.software])
@@ -452,7 +456,7 @@ def fetch_blocks(args: argparse.Namespace) -> int:
             block["block_level"] = blocks.alias_block_level(block["block_level"])
             logger.debug("block[block_level]='%s' - AFTER!", block["block_level"])
 
-            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] in ["rejected", "suspended"] and config.get("bot_enabled"):
+            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] in ["rejected", "suspended"] and _bot_enabled:
                 logger.debug("Appending block[blocked]'%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                 blockdict.append({
                     "blocked": block["blocked"],
@@ -474,8 +478,8 @@ def fetch_blocks(args: argparse.Namespace) -> int:
             logger.debug("Invoking cookies.clear(%s) ...", blocker)
             cookies.clear(blocker)
 
-        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
-        if config.get("bot_enabled") and len(blockdict) > 0:
+        logger.debug("_bot_enabled='%s',blockdict()=%d'", _bot_enabled, len(blockdict))
+        if _bot_enabled and len(blockdict) > 0:
             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
             network.send_bot_post(blocker, blockdict)
@@ -502,7 +506,7 @@ def fetch_observer(args: argparse.Namespace) -> int:
     raw = network.fetch_url(
         f"https://{source_domain}",
         network.web_headers,
-        (config.get("connection_timeout"), config.get("read_timeout"))
+        timeout=_timeout
     ).text
     logger.debug("raw[%s]()=%d", type(raw), len(raw))
@@ -632,7 +636,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
     raw = network.fetch_url(
         f"https://{source_domain}/todon/domainblocks",
         network.web_headers,
-        (config.get("connection_timeout"), config.get("read_timeout"))
+        timeout=_timeout
     ).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
@@ -656,18 +660,16 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
     blockdict = list()
     for block_level in blocklist:
+        logger.debug("block_level='%s'", block_level)
         blockers = blocklist[block_level]
 
-        logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
+        logger.debug("Checking %d blocker entries for block_level='%s' ...", len(blockers), block_level)
         for blocked in blockers:
             logger.debug("blocked='%s'", blocked)
 
             if not domain_helper.is_wanted(blocked):
                 logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                 continue
-            elif not domain_helper.is_wanted(blocker):
-                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
-                continue
             elif blocks.is_instance_blocked(blocker, blocked, block_level):
                 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                 continue
@@ -681,7 +683,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
             logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
 
-            if processing.block(blocker, blocked, None, block_level) and block_level in ["suspended", "rejected"] and config.get("bot_enabled"):
+            if processing.block(blocker, blocked, None, block_level) and block_level in ["suspended", "rejected"] and _bot_enabled:
                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
                 blockdict.append({
                     "blocked": blocked,
@@ -691,8 +693,8 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
     logger.debug("Invoking commit() ...")
     database.connection.commit()
 
logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) - if config.get("bot_enabled") and len(blockdict) > 0: + logger.debug("_bot_enabled='%s',blockdict()=%d", _bot_enabled, len(blockdict)) + if _bot_enabled and len(blockdict) > 0: logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) network.send_bot_post(blocker, blockdict) @@ -747,7 +749,7 @@ def fetch_cs(args: argparse.Namespace): raw = network.fetch_url( f"https://{source_domain}/federation", network.web_headers, - (config.get("connection_timeout"), config.get("read_timeout")) + timeout=_timeout ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) @@ -788,7 +790,7 @@ def fetch_cs(args: argparse.Namespace): logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"]) instances.set_last_error(row["domain"], exception) - if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level in ["suspended", "rejected"] and config.get("bot_enabled"): + if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level in ["suspended", "rejected"] and _bot_enabled: logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker) blockdict.append({ "blocked": row["domain"], @@ -798,8 +800,8 @@ def fetch_cs(args: argparse.Namespace): logger.debug("Invoking commit() ...") database.connection.commit() - logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) - if config.get("bot_enabled") and len(blockdict) > 0: + logger.debug("_bot_enabled='%s',blockdict()=%d", _bot_enabled, len(blockdict)) + if _bot_enabled and len(blockdict) > 0: logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) network.send_bot_post(blocker, blockdict) @@ -831,7 +833,7 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: sources.update(domain) logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed) - response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + response = network.fetch_url(args.feed, network.web_headers, _timeout) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code == 200 and len(response.text) > 0: @@ -910,7 +912,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: domains = list() logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed) - response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + response = network.fetch_url(feed, network.web_headers, _timeout) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code == 200 and len(response.text) > 0: @@ -1137,7 +1139,7 @@ def fetch_txt(args: argparse.Namespace) -> int: logger.info("Checking %d text file(s) ...", len(blocklists.txt_files)) for row in blocklists.txt_files: logger.debug("Fetching row[url]='%s' ...", row["url"]) - response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + response = network.fetch_url(row["url"], network.web_headers, _timeout) 
logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code == 200 and response.text != "": @@ -1185,7 +1187,7 @@ def fetch_fedipact(args: argparse.Namespace) -> int: response = network.fetch_url( f"https://{source_domain}", network.web_headers, - (config.get("connection_timeout"), config.get("read_timeout")) + timeout=_timeout ) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) @@ -1244,7 +1246,7 @@ def fetch_joinmobilizon(args: argparse.Namespace) -> int: raw = network.fetch_url( f"https://{source_domain}/api/v1/instances", network.web_headers, - (config.get("connection_timeout"), config.get("read_timeout")) + timeout=_timeout ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) @@ -1295,7 +1297,7 @@ def fetch_joinmisskey(args: argparse.Namespace) -> int: raw = network.fetch_url( f"https://{source_domain}/instances.json", network.web_headers, - (config.get("connection_timeout"), config.get("read_timeout")) + timeout=_timeout ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) @@ -1445,7 +1447,7 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: block["block_level"] = blocks.alias_block_level(block["block_level"]) logger.debug("block[block_level]='%s' - AFTER!", block["block_level"]) - if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] in ["suspended", "rejected"] and config.get("bot_enabled"): + if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] in ["suspended", "rejected"] and _bot_enabled: logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["block_level"], row["domain"]) blockdict.append({ "blocked": blocked, @@ -1471,8 +1473,8 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: logger.debug("Invoking commit() ...") database.connection.commit() - logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) - if config.get("bot_enabled") and len(blockdict) > 0: + logger.debug("_bot_enabled='%s',blockdict()=%d", _bot_enabled, len(blockdict)) + if _bot_enabled and len(blockdict) > 0: logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict)) network.send_bot_post(row["domain"], blockdict) @@ -1494,7 +1496,7 @@ def fetch_fedilist(args: argparse.Namespace) -> int: sources.update(source_domain) url = f"http://{source_domain}/instance/csv?onion=not" - if args.software is not None and args.software != "": + if args.software not in [None, ""]: logger.debug("args.software='%s'", args.software) url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not" @@ -1502,7 +1504,7 @@ def fetch_fedilist(args: argparse.Namespace) -> int: response = reqto.get( url, headers=network.web_headers, - timeout=(config.get("connection_timeout"), config.get("read_timeout")), + timeout=_timeout, allow_redirects=False ) @@ -1561,13 +1563,13 @@ def update_nodeinfo(args: argparse.Namespace) -> int: logger.debug("Invoking locking.acquire() ...") locking.acquire() - if args.domain is not None and args.domain != "": + if args.domain not in [None, ""]: logger.debug("Fetching args.domain='%s'", args.domain) database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? 
LIMIT 1", [args.domain]) - elif args.software is not None and args.software != "": + elif args.software not in [None, ""]: logger.info("Fetching domains for args.software='%s'", args.software) database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software]) - elif args.mode is not None and args.mode != "": + elif args.mode not in [None, ""]: logger.info("Fetching domains for args.mode='%s'", args.mode.upper()) database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode]) elif args.no_software: @@ -1655,7 +1657,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int: logger.info("Fetching list from source_domain='%s' ...", source_domain) rows = network.fetch_json_rows( source_domain, - "/api/1.0/instances/list?count=0&sort_by=name", + "/api/1.0/instances/list?count=0&sort_by=name", { "Authorization": f"Bearer {config.get('instances_social_api_key')}", }, @@ -1664,7 +1666,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int: logger.info("Checking %d row(s) ...", len(rows)) for row in rows: - logger.debug("row[]='%s'", type(row)) + logger.debug("row[]='%s' - BEFORE!", type(row)) domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None logger.debug("domain='%s' - AFTER!", domain) @@ -1720,7 +1722,7 @@ def fetch_relaylist(args: argparse.Namespace) -> int: logger.info("Checking %d row(s) ...", len(rows)) for row in rows: - logger.debug("row[]='%s'", type(row)) + logger.debug("row[%s]='%s' - BEFORE!", type(row), row) domain = urlparse(row["url"]).netloc.lower().split(":")[0] logger.debug("domain='%s' - AFTER!", domain) @@ -1757,10 +1759,10 @@ def fetch_relays(args: argparse.Namespace) -> int: logger.debug("Invoking locking.acquire() ...") locking.acquire() - if args.domain is not None and args.domain != "": + if args.domain not in [None, ""]: logger.debug("Fetching instances record for args.domain='%s' ...", args.domain) database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain]) - elif args.software is not None and args.software != "": + elif args.software not in [None, ""]: logger.debug("Fetching instances records for args.software='%s' ...", args.software) database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? 
         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
     else:
@@ -1787,7 +1789,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
             logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
             raw = network.fetch_api_url(
                 row["nodeinfo_url"],
-                (config.get("connection_timeout"), config.get("read_timeout"))
+                timeout=_timeout
             )
 
             logger.debug("raw[%s]()=%d", type(raw), len(raw))
@@ -1814,7 +1816,7 @@ def fetch_relays(args: argparse.Namespace) -> int:
             raw = network.fetch_url(
                 f"https://{row['domain']}",
                 network.web_headers,
-                (config.get("connection_timeout"), config.get("read_timeout"))
+                timeout=_timeout
             ).text
             logger.debug("raw[%s]()=%d", type(raw), len(raw))
diff --git a/fba/helpers/blacklist.py b/fba/helpers/blacklist.py
index 723bf65..7608756 100644
--- a/fba/helpers/blacklist.py
+++ b/fba/helpers/blacklist.py
@@ -79,6 +79,8 @@ _blacklist = {
     "static.sl-reverse.messenger.com": "Please get yourself a proper domain name, no static-IP host names",
     "documentation.on.seirdy.one"    : "Just ignore such lines!",
     "drankdrankdrank"                : "Mass flooding of instances",
+    "cn24tv.it/page/"                : "Useless massive pages",
+    "youtube.com/channel/"           : "Useless massive YT channels",
 }
 
 @lru_cache
diff --git a/fba/helpers/processing.py b/fba/helpers/processing.py
index 8326d8e..463978b 100644
--- a/fba/helpers/processing.py
+++ b/fba/helpers/processing.py
@@ -202,14 +202,18 @@ def csv_block(blocker: str, url: str, command: str) -> None:
             reject_media = True
         elif "reject_media" in row and row["reject_media"].lower() == "true":
             reject_media = True
+        else:
+            logger.debug("row='%s' for domain='%s' does not contain key '[#]reject_media'", row, domain)
 
         if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
             reject_reports = True
         elif "reject_reports" in row and row["reject_reports"].lower() == "true":
             reject_reports = True
+        else:
+            logger.debug("row='%s' for domain='%s' does not contain key '[#]reject_reports'", row, domain)
 
         cnt = cnt + 1
-        logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
+        logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s',cnt=%d", domain, severity, reject_media, reject_reports, cnt)
         if domain in [None, ""]:
             logger.debug("domain='%s' is empty - SKIPPED!", domain)
             continue
diff --git a/fba/http/csrf.py b/fba/http/csrf.py
index d85445b..96c02f2 100644
--- a/fba/http/csrf.py
+++ b/fba/http/csrf.py
@@ -32,6 +32,7 @@ from fba.models import instances
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+#logger.setLevel(logging.DEBUG)
 
 def determine(domain: str, headers: dict) -> dict:
     logger.debug("domain='%s',headers()=%d - CALLED!", domain, len(headers))
diff --git a/fba/http/federation.py b/fba/http/federation.py
index 15c6799..c24ca63 100644
--- a/fba/http/federation.py
+++ b/fba/http/federation.py
@@ -49,8 +49,13 @@ _api_paths = [
     "/api/v3/site",
 ]
 
+# Local "cache" to shorten intense debug output
+_max_crawl_depth = config.get("max_crawl_depth")
+_min_peers_length = config.get("min_peers_length")
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+#logger.setLevel(logging.DEBUG)
 
 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None) -> None:
     global _DEPTH
@@ -200,7 +205,7 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path:
             instances.update(domain)
logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH) - if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"): + if _DEPTH <= _max_crawl_depth and len(peerlist) >= _min_peers_length: logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH) fetch_instances(instance, domain, None, command, path) else: diff --git a/fba/http/network.py b/fba/http/network.py index a2e3b43..5ca77bc 100644 --- a/fba/http/network.py +++ b/fba/http/network.py @@ -35,9 +35,6 @@ from fba.helpers import json as json_helper from fba.models import instances -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - # HTTP headers for non-API requests web_headers = { "User-Agent": config.get("useragent"), @@ -64,6 +61,10 @@ exceptions = ( urllib3.exceptions.LocationParseError ) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +#logger.setLevel(logging.DEBUG) + def post_json_api(domain: str, path: str, data: str = "", headers: dict = dict()) -> dict: logger.debug("domain='%s',path='%s',data='%s',headers()=%d - CALLED!", domain, path, data, len(headers)) domain_helper.raise_on(domain) diff --git a/fba/http/nodeinfo.py b/fba/http/nodeinfo.py index db89b21..9e81bb9 100644 --- a/fba/http/nodeinfo.py +++ b/fba/http/nodeinfo.py @@ -27,9 +27,6 @@ from fba.http import network from fba.models import instances -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - # Well-known URLs for nodeinfo informations _well_known_nodeinfo_urls = [ "/.well-known/x-nodeinfo2", @@ -59,6 +56,13 @@ _nodeinfo_identifier = [ "http://nodeinfo.diaspora.software/ns/schema/1.0", ] +# Locally "cached" values to speedup code and keep massive debug log shorter +_timeout = (config.get("connection_timeout"), config.get("read_timeout")) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +#logger.setLevel(logging.DEBUG) + def fetch(domain: str, path: str = None, update_mode: bool = True) -> dict: logger.debug("domain='%s',path='%s',update_mode='%s' - CALLED!", domain, path, update_mode) domain_helper.raise_on(domain) @@ -247,7 +251,7 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict: logger.debug("Fetching nodeinfo from url='%s' ...", url) data = network.fetch_api_url( url, - (config.get("connection_timeout"), config.get("read_timeout")) + timeout=_timeout ) logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data)) -- 2.39.5