From 2d853c9f29e7d4106589b25a7b0dcc3bce6e2234 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Tue, 12 Sep 2023 12:00:19 +0200 Subject: [PATCH] Continued: - added command 'fetch_csv' which fetches CSV files and processes them for further instance discovery and blocklist expansion - introduced function processing.csv_block() which does the above processing - return non-zero exit code when source was queried too recently --- fba/boot.py | 7 ++ fba/commands.py | 156 +++++++------------------------------- fba/helpers/blocklists.py | 14 +++- fba/helpers/processing.py | 152 +++++++++++++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 131 deletions(-) diff --git a/fba/boot.py b/fba/boot.py index 8e0f3fe..a2941ab 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -123,6 +123,13 @@ def init_parser(): parser.set_defaults(command=commands.fetch_oliphant) parser.add_argument("--domain", help="Instance name (aka. domain) to check") + ### Fetch blocks from other CSV files + parser = subparser_command.add_parser( + "fetch_csv", + help="Fetches CSV files (block recommendations) for more possible instances to discover", + ) + parser.set_defaults(command=commands.fetch_csv) + ### Fetch instances from given initial instance ### parser = subparser_command.add_parser( "fetch_instances", diff --git a/fba/commands.py b/fba/commands.py index 1d89306..1ceb234 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -112,7 +112,7 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -185,7 +185,7 @@ def fetch_bkali(args: argparse.Namespace) -> int: source_domain = "gql.api.bka.li" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - 
EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -466,7 +466,7 @@ def fetch_observer(args: argparse.Namespace) -> int: source_domain = "fediverse.observer" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -568,7 +568,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int: source_domain = "wiki.todon.eu" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -688,7 +688,7 @@ def fetch_cs(args: argparse.Namespace): source_domain = "raw.githubusercontent.com" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -852,7 +852,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -993,6 +993,20 @@ def fetch_instances(args: argparse.Namespace) -> int: logger.debug("Success - EXIT!") return 0 +def fetch_csv(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + logger.info("Checking 
%d CSV files ...", len(blocklists.csv_files)) + for block in blocklists.csv_files: + logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"]) + processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name) + + logger.debug("Success - EXIT!") + return 0 + def fetch_oliphant(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) @@ -1002,7 +1016,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int: source_domain = "codeberg.org" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -1010,8 +1024,6 @@ def fetch_oliphant(args: argparse.Namespace) -> int: # Base URL base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists" - domains = list() - logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists)) for block in blocklists.oliphant_blocklists: # Is domain given and not equal blocker? 
@@ -1022,119 +1034,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int: logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain) continue - instances.set_last_blocked(block["blocker"]) - - # Fetch this URL - logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"]) - response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - - logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content)) - if not response.ok or response.status_code > 200 or response.content == "": - logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"]) - continue - - logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content)) - reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix") - - blockdict = list() - - cnt = 0 - for row in reader: - logger.debug("row[%s]='%s'", type(row), row) - domain = severity = None - reject_media = reject_reports = False - - if "#domain" in row: - domain = row["#domain"] - elif "domain" in row: - domain = row["domain"] - else: - logger.debug("row='%s' does not contain domain column", row) - continue - - if "#severity" in row: - severity = blocks.alias_block_level(row["#severity"]) - elif "severity" in row: - severity = blocks.alias_block_level(row["severity"]) - else: - logger.debug("row='%s' does not contain severity column", row) - continue - - if "#reject_media" in row and row["#reject_media"].lower() == "true": - reject_media = True - elif "reject_media" in row and row["reject_media"].lower() == "true": - reject_media = True - - if "#reject_reports" in row and row["#reject_reports"].lower() == "true": - reject_reports = True - elif "reject_reports" in row and row["reject_reports"].lower() == "true": - reject_reports = True - - cnt = cnt + 1 - 
logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports) - if domain is None or domain == "": - logger.debug("domain='%s' is empty - SKIPPED!", domain) - continue - elif domain.endswith(".onion"): - logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain) - continue - elif domain.endswith(".arpa"): - logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain) - continue - elif domain.endswith(".tld"): - logger.debug("domain='%s' is a fake domain - SKIPPED", domain) - continue - elif domain.find("*") >= 0 or domain.find("?") >= 0: - logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"]) - domain = utils.deobfuscate(domain, block["blocker"]) - logger.debug("domain='%s' - AFTER!", domain) - - if not validators.domain(domain): - logger.debug("domain='%s' is not a valid domain - SKIPPED!") - continue - elif blacklist.is_blacklisted(domain): - logger.warning("domain='%s' is blacklisted - SKIPPED!", domain) - continue - elif blocks.is_instance_blocked(block["blocker"], domain, severity): - logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity) - continue - - logger.debug("Marking domain='%s' as handled", domain) - domains.append(domain) - - logger.debug("Processing domain='%s' ...", domain) - processed = processing.instance(domain, block["blocker"], inspect.currentframe().f_code.co_name) - logger.debug("processed='%s'", processed) - - if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"): - logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"]) - blockdict.append({ - "blocked": domain, - "reason" : block["reason"], - }) - - if reject_media: - processing.block(block["blocker"], domain, None, "reject_media") - if reject_reports: - 
processing.block(block["blocker"], domain, None, "reject_reports") - - logger.debug("block[blocker]='%s'", block["blocker"]) - if not blocklists.has(block["blocker"]): - logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains)) - instances.set_total_blocks(block["blocker"], domains) - - logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"]) - if instances.has_pending(block["blocker"]): - logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"]) - instances.update(block["blocker"]) - - logger.debug("Invoking commit() ...") - database.connection.commit() - - logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) - if config.get("bot_enabled") and len(blockdict) > 0: - logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict)) - network.send_bot_post(block["blocker"], blockdict) + processing.csv_block(block["blocker"], f"{base_url}/{block['csv_url']}", inspect.currentframe().f_code.co_name) logger.debug("Success! 
- EXIT!") return 0 @@ -1197,7 +1097,7 @@ def fetch_fedipact(args: argparse.Namespace) -> int: source_domain = "fedipact.online" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -1256,7 +1156,7 @@ def fetch_joinmobilizon(args: argparse.Namespace) -> int: source_domain = "instances.joinmobilizon.org" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -1304,7 +1204,7 @@ def fetch_joinmisskey(args: argparse.Namespace) -> int: source_domain = "instanceapp.misskey.page" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -1352,7 +1252,7 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: source_domain = "joinfediverse.wiki" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) @@ -1657,7 +1557,7 @@ def fetch_fedilist(args: argparse.Namespace) -> int: source_domain = "demo.fedilist.com" if sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 1 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) 
@@ -1797,7 +1697,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int: return 1 elif sources.is_recent(source_domain): logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) - return 0 + return 2 else: logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) sources.update(source_domain) diff --git a/fba/helpers/blocklists.py b/fba/helpers/blocklists.py index b6d018b..819d8a8 100644 --- a/fba/helpers/blocklists.py +++ b/fba/helpers/blocklists.py @@ -22,7 +22,7 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) #logger.setLevel(logging.DEBUG) -# URLs to fetch +# Blocklists hosted by oliphant oliphant_blocklists = ( { "blocker": "artisan.chat", @@ -72,7 +72,15 @@ oliphant_blocklists = ( },{ "blocker": "oliphant.social", "csv_url": "mastodon/birdsite.csv", - } + }, +) + +# Other CSV files +csv_files = ( + { + "blocker": "tooters.org", + "csv_url": "https://raw.githubusercontent.com/victorwynne/victorwynne/tooters/federation/tooters_defederations.csv", + }, ) def has(domain: str) -> bool: @@ -81,7 +89,7 @@ def has(domain: str) -> bool: # Default is not found found = False - for row in oliphant_blocklists: + for row in oliphant_blocklists + csv_files: logger.debug("row[blocker]='%s',domain='%s'", row["blocker"], domain) if row["blocker"] == domain: found = True diff --git a/fba/helpers/processing.py b/fba/helpers/processing.py index 5881ded..116aa2f 100644 --- a/fba/helpers/processing.py +++ b/fba/helpers/processing.py @@ -13,12 +13,19 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+import csv import logging +import validators + +from fba import database from fba import utils from fba.helpers import blacklist +from fba.helpers import blocklists +from fba.helpers import config from fba.helpers import domain as domain_helper +from fba.helpers import tidyup from fba.http import federation from fba.http import network @@ -96,3 +103,148 @@ def block(blocker: str, blocked: str, reason: str, block_level: str) -> bool: logger.debug("added='%s' - EXIT!", added) return added + +def csv_block(blocker: str, url: str, command: str): + logger.debug("blocker='%s',url='%s',command='%s' - CALLED!", blocker, url, command) + domain_helper.raise_on(blocker) + + if not isinstance(url, str): + raise ValueError(f"url[]='{url}' is not of type 'str'") + elif url == "": + raise ValueError("Parameter 'url' is empty") + elif not isinstance(command, str): + raise ValueError(f"command[]='{command}' is not of type 'str'") + elif command == "": + raise ValueError("Parameter 'command' is empty") + + logger.debug("Setting last_blocked for blocker='%s' ...", blocker) + instances.set_last_blocked(blocker) + + domains = list() + + # Fetch this URL + logger.info("Fetching url='%s' for blocker='%s' ...", url, blocker) + response = utils.fetch_url( + url, + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content)) + if not response.ok or response.status_code > 200 or response.content == "": + logger.warning("Could not fetch url='%s' for blocker='%s' - EXIT!", url, blocker) + return + + logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content)) + reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix") + + blockdict = list() + + cnt = 0 + for row in reader: + logger.debug("row[%s]='%s'", type(row), row) + domain = severity = reason = None + reject_media = reject_reports 
= False + + if "#domain" in row: + domain = tidyup.domain(row["#domain"]) if row["#domain"] != None and row["#domain"] != "" else None + elif "domain" in row: + domain = tidyup.domain(row["domain"]) if row["domain"] != None and row["domain"] != "" else None + elif "Domain" in row: + domain = tidyup.domain(row["Domain"]) if row["Domain"] != None and row["Domain"] != "" else None + else: + logger.warning("row='%s' does not contain domain column - SKIPPED!", row) + continue + + if "#severity" in row: + severity = blocks.alias_block_level(row["#severity"]) + elif "severity" in row: + severity = blocks.alias_block_level(row["severity"]) + else: + logger.debug("row='%s' does not contain severity column, setting 'reject'", row) + severity = "reject" + + if "reason" in row: + reason = tidyup.reason(row["reason"]) if row["reason"] != None and row["reason"] != "" else None + elif "comment" in row: + reason = tidyup.reason(row["comment"]) if row["comment"] != None and row["comment"] != "" else None + else: + logger.debug("row='%s' has no reason/comment key provided", row) + + if "#reject_media" in row and row["#reject_media"].lower() == "true": + reject_media = True + elif "reject_media" in row and row["reject_media"].lower() == "true": + reject_media = True + + if "#reject_reports" in row and row["#reject_reports"].lower() == "true": + reject_reports = True + elif "reject_reports" in row and row["reject_reports"].lower() == "true": + reject_reports = True + + cnt = cnt + 1 + logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports) + if domain is None or domain == "": + logger.debug("domain='%s' is empty - SKIPPED!", domain) + continue + elif domain.endswith(".onion"): + logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain) + continue + elif domain.endswith(".arpa"): + logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain) + continue + elif domain.endswith(".tld"): + 
logger.debug("domain='%s' is a fake domain - SKIPPED", domain) + continue + elif domain.find("*") >= 0 or domain.find("?") >= 0: + logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, blocker) + domain = utils.deobfuscate(domain, blocker) + logger.debug("domain='%s' - AFTER!", domain) + + if not validators.domain(domain): + logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain) + continue + elif blacklist.is_blacklisted(domain): + logger.warning("domain='%s' is blacklisted - SKIPPED!", domain) + continue + elif blocks.is_instance_blocked(blocker, domain, severity): + logger.debug("blocker='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", blocker, domain, severity) + continue + + logger.debug("Marking domain='%s' as handled", domain) + domains.append(domain) + + logger.debug("Processing domain='%s',blocker='%s',command='%s' ...", domain, blocker, command) + processed = instance(domain, blocker, command) + logger.debug("processed='%s'", processed) + + if block(blocker, domain, reason, severity) and config.get("bot_enabled"): + logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, reason, blocker) + blockdict.append({ + "blocked": domain, + "reason" : reason, + }) + + if reject_media: + block(blocker, domain, None, "reject_media") + if reject_reports: + block(blocker, domain, None, "reject_reports") + + logger.debug("blocker='%s'", blocker) + if not blocklists.has(blocker): + logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", blocker, len(domains)) + instances.set_total_blocks(blocker, domains) + + logger.debug("Checking if blocker='%s' has pending updates ...", blocker) + if instances.has_pending(blocker): + logger.debug("Flushing updates for blocker='%s' ...", blocker) + instances.update(blocker) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", 
config.get("bot_enabled"), len(blockdict)) + if config.get("bot_enabled") and len(blockdict) > 0: + logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) + network.send_bot_post(blocker, blockdict) + + logger.debug("EXIT!") -- 2.39.5