X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fcommands.py;h=47d21ed4e10e68a6d2d35561790d3d1ebab25cd1;hb=b7835ba713d0b484e90020f747e19093a132829d;hp=65965780767b8ca20f4b0754915beb96dd743097;hpb=2c841ed5b537b238dcbee64e20152d719ce91686;p=fba.git diff --git a/fba/commands.py b/fba/commands.py index 6596578..47d21ed 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -20,6 +20,8 @@ import json import logging import time +from urllib.parse import urlparse + import argparse import atoma import bs4 @@ -27,12 +29,16 @@ import markdown import reqto import validators -from fba import fba +from fba import csrf +from fba import database +from fba import utils from fba.helpers import blacklist from fba.helpers import config from fba.helpers import cookies from fba.helpers import locking +from fba.helpers import processing +from fba.helpers import software as software_helper from fba.helpers import tidyup from fba.http import federation @@ -40,6 +46,7 @@ from fba.http import network from fba.models import blocks from fba.models import instances +from fba.models import sources from fba.networks import friendica from fba.networks import lemmy @@ -49,6 +56,7 @@ from fba.networks import pleroma logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +#logger.setLevel(logging.DEBUG) def check_instance(args: argparse.Namespace) -> int: logger.debug("args.domain='%s' - CALLED!", args.domain) @@ -65,28 +73,141 @@ def check_instance(args: argparse.Namespace) -> int: else: logger.info("args.domain='%s' is not known", args.domain) - logger.debug(f"status={status} - EXIT!") + logger.debug("status=%d - EXIT!", status) return status +def check_nodeinfo(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + # Fetch rows + database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC") + + cnt = 0 + for row in database.cursor.fetchall(): + logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"]) + punycode = row["domain"].encode("idna").decode("utf-8") + + if row["nodeinfo_url"].startswith("/"): + logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"]) + continue + elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1: + logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"]) + cnt = cnt + 1 + + logger.info("Found %d row(s)", cnt) + + logger.debug("EXIT!") + return 0 + +def fetch_pixelfed_api(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + # No CSRF by default, you don't have to add network.source_headers by yourself here + headers = tuple() + source_domain = "pixelfed.org" + + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + try: + logger.debug("Checking CSRF from source_domain='%s' ...", source_domain) + headers = csrf.determine(source_domain, dict()) + except network.exceptions as exception: + logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__) + return list() + + try: + logger.debug("Fetching JSON from 
pixelfed.org API, headers()=%d ...", len(headers)) + fetched = network.get_json_api( + source_domain, + "/api/v1/servers/all.json?scope=All&country=all&language=all", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + logger.debug("JSON API returned %d elements", len(fetched)) + if "error_message" in fetched: + logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"]) + return 101 + elif "data" not in fetched["json"]: + logger.warning("API did not return JSON with 'data' element - EXIT!") + return 102 + + rows = fetched["json"]["data"] + logger.info("Checking %d fetched rows ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + if "domain" not in row: + logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row) + continue + elif row["domain"] == "": + logger.debug("row[domain] is empty - SKIPPED!") + continue + + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + domain = row["domain"].encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + continue + elif instances.is_registered(domain): + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue + + logger.debug("Fetching instances from domain='%s' ...", domain) + federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) + + except network.exceptions as exception: + logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception)) + return 103 + + logger.debug("Success! 
- EXIT!") + return 0 + def fetch_bkali(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "gql.api.bka.li" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + domains = list() try: - fetched = network.post_json_api("gql.api.bka.li", "/v1/graphql", json.dumps({ - "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" - })) + logger.info("Fetching domainlist from source_domain='%s' ...", source_domain) + fetched = network.post_json_api( + source_domain, + "/v1/graphql", + json.dumps({ + "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" + }) + ) logger.debug("fetched[]='%s'", type(fetched)) if "error_message" in fetched: - logger.warning(f"post_json_api() for 'gql.api.bka.li' returned error message: {fetched['error_message']}") + logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"]) return 100 elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]: - logger.warning(f"post_json_api() returned error: {fetched['error']['message']}") + logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"]) return 101 rows = fetched["json"] - logger.debug(f"rows({len(rows)})[]='{type(rows)}'") + logger.debug("rows(%d)[]='%s'", len(rows), type(rows)) if len(rows) == 0: raise Exception("WARNING: Returned no records") elif "data" not in rows: @@ -95,271 +216,300 @@ def fetch_bkali(args: argparse.Namespace) -> int: raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'") for entry in rows["data"]["nodeinfo"]: - logger.debug(f"entry['{type(entry)}']='{entry}'") + logger.debug("entry[%s]='%s'", type(entry), entry) if "domain" not in entry: - logger.warning(f"entry()={len(entry)} does not contain 'domain' - SKIPPED!") + logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry)) continue - elif not validators.domain(entry["domain"]): - logger.warning(f"domain='{entry['domain']}' is not a valid domain - SKIPPED!") + elif entry["domain"] == "": + logger.debug("entry[domain] is empty - SKIPPED!") continue - elif blacklist.is_blacklisted(entry["domain"]): - logger.debug(f"domain='{entry['domain']}' is blacklisted - SKIPPED!") + elif not utils.is_domain_wanted(entry["domain"]): + logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"]) continue elif instances.is_registered(entry["domain"]): - logger.debug(f"domain='{entry['domain']}' is already registered - SKIPPED!") + logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"]) continue elif instances.is_recent(entry["domain"]): - logger.debug(f"domain='{entry['domain']}' has been recently fetched - SKIPPED!") + logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"]) continue - logger.debug(f"Adding domain='{entry['domain']}' ...") + logger.debug("Adding domain='%s' ...", entry["domain"]) domains.append(entry["domain"]) except network.exceptions as exception: - logger.error(f"Cannot fetch graphql,exception[{type(exception)}]:'{str(exception)}' - EXIT!") + logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", 
type(exception), str(exception)) return 102 - logger.debug(f"domains()={len(domains)}") + logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + try: logger.info("Fetching instances from domain='%s' ...", domain) - federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) - - logger.debug(f"Invoking cookies.clear({domain}) ...") - cookies.clear(domain) + federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_bkali) from domain='{domain}'") + logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) + return 100 logger.debug("Success - EXIT!") return 0 -def fetch_blocks(args: argparse.Namespace): +def fetch_blocks(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) if args.domain is not None and args.domain != "": - logger.debug(f"args.domain='{args.domain}' - checking ...") + logger.debug("args.domain='%s' - checking ...", args.domain) if not validators.domain(args.domain): - logger.warning(f"domain='{args.domain}' is not valid.") - return + logger.warning("args.domain='%s' is not valid.", args.domain) + return 100 elif blacklist.is_blacklisted(args.domain): - logger.warning(f"domain='{args.domain}' is blacklisted, won't check it!") - return + logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain) + return 101 elif not instances.is_registered(args.domain): - logger.warning(f"domain='{args.domain}' is not registered, please run ./fba.py fetch_instances {args.domain} first.") - return + logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain) + return 102 + logger.debug("Invoking locking.acquire() ...") locking.acquire() if args.domain is not None and args.domain != "": # Re-check single domain - logger.debug(f"Querying database for single args.domain='{args.domain}' ...") - fba.cursor.execute( + logger.debug("Querying database for single args.domain='%s' ...", args.domain) + database.cursor.execute( "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain] ) elif args.software is not None and args.software != "": # Re-check single software - logger.debug(f"Querying database for args.software='{args.software}' ...") - fba.cursor.execute( - "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ?", [args.software] + logger.debug("Querying database for args.software='%s' ...", args.software) + database.cursor.execute( + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software] ) else: # Re-check after "timeout" (aka. minimum interval) - fba.cursor.execute( - "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube') AND (last_blocked IS NULL OR last_blocked < ?) 
ORDER BY rowid DESC", [time.time() - config.get("recheck_block")] + database.cursor.execute( + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")] ) - rows = fba.cursor.fetchall() + rows = database.cursor.fetchall() logger.info("Checking %d entries ...", len(rows)) for blocker, software, origin, nodeinfo_url in rows: - logger.debug("BEFORE blocker,software,origin,nodeinfo_url:", blocker, software, origin, nodeinfo_url) - blockdict = list() + logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url) blocker = tidyup.domain(blocker) - logger.debug("AFTER blocker,software:", blocker, software) + logger.debug("blocker='%s' - AFTER!", blocker) if blocker == "": logger.warning("blocker is now empty!") continue elif nodeinfo_url is None or nodeinfo_url == "": - logger.debug(f"blocker='{blocker}',software='{software}' has empty nodeinfo_url") + logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software) continue - elif blacklist.is_blacklisted(blocker): - logger.warning(f"blocker='{blocker}' is blacklisted now!") + elif not utils.is_domain_wanted(blocker): + logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker) continue - logger.debug(f"blocker='{blocker}'") + logger.debug("blocker='%s'", blocker) instances.set_last_blocked(blocker) + instances.set_has_obfuscation(blocker, False) + blocking = list() if software == "pleroma": logger.info("blocker='%s',software='%s'", blocker, software) - pleroma.fetch_blocks(blocker, origin, nodeinfo_url) + blocking = pleroma.fetch_blocks(blocker, nodeinfo_url) elif software == "mastodon": logger.info("blocker='%s',software='%s'", blocker, software) - mastodon.fetch_blocks(blocker, origin, nodeinfo_url) + blocking = mastodon.fetch_blocks(blocker, nodeinfo_url) elif software == "lemmy": logger.info("blocker='%s',software='%s'", blocker, software) - lemmy.fetch_blocks(blocker, origin, nodeinfo_url) - elif software == "friendica" or software == "misskey": + blocking = lemmy.fetch_blocks(blocker, nodeinfo_url) + elif software == "friendica": logger.info("blocker='%s',software='%s'", blocker, software) + blocking = friendica.fetch_blocks(blocker) + elif software == "misskey": + logger.info("blocker='%s',software='%s'", blocker, software) + blocking = misskey.fetch_blocks(blocker) + else: + logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software) + + logger.debug("blocker='%s'", blocker) + if blocker != "chaos.social": + logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) + instances.set_total_blocks(blocker, blocking) + + logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software) + blockdict = list() + for block in blocking: + logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"]) + + if block["block_level"] == "": + logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"]) + continue + + logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"]) + block["blocked"] = tidyup.domain(block["blocked"]) + block["reason"] = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None + 
logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"]) + + if block["blocked"] == "": + logger.warning("blocked is empty, blocker='%s'", blocker) + continue + elif block["blocked"].endswith(".onion"): + logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"]) + continue + elif block["blocked"].endswith(".arpa"): + logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"]) + continue + elif block["blocked"].endswith(".tld"): + logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"]) + continue + elif block["blocked"].find("*") >= 0: + logger.debug("blocker='%s' uses obfuscated domains", blocker) + + # Some friendica servers also obscure domains without hash + row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None) - blocking = list() - if software == "friendica": - blocking = friendica.fetch_blocks(blocker) - elif software == "misskey": - blocking = misskey.fetch_blocks(blocker) - - logger.info("Checking %s entries from blocker='%s',software='%s' ...", len(blocking.items()), blocker, software) - for block_level, blocklist in blocking.items(): - logger.debug("blocker='%s',block_level='%s',blocklist()=%d", blocker, block_level, len(blocklist)) - block_level = tidyup.domain(block_level) - logger.debug("AFTER-block_level='%s'", block_level) - if block_level == "": - logger.warning("block_level is empty, blocker:", blocker) + logger.debug("row[]='%s'", type(row)) + if row is None: + logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software) + instances.set_has_obfuscation(blocker, True) continue - logger.debug(f"Checking {len(blocklist)} entries from blocker='{blocker}',software='{software}',block_level='{block_level}' ...") - for block in blocklist: - blocked, reason = block.values() - logger.debug(f"blocked='{blocked}',reason='{reason}' - BEFORE!") - blocked = tidyup.domain(blocked) - reason = tidyup.reason(reason) if reason is not None and reason != "" else None - logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!") + block["blocked"] = row["domain"] + origin = row["origin"] + nodeinfo_url = row["nodeinfo_url"] + elif block["blocked"].find("?") >= 0: + logger.debug("blocker='%s' uses obfuscated domains", blocker) - if blocked == "": - logger.warning("blocked is empty, blocker='%s'", blocker) - continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) - continue - elif blocked.count("*") > 0: - # Some friendica servers also obscure domains without hash - row = instances.deobscure("*", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocker='{blocker}',software='{software}' - SKIPPED!") - continue - - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Some obscure them with question marks, not sure if that's dependent on version or not - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocker='{blocker}',software='{software}' - SKIPPED!") - continue - - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - - logger.debug("Looking up instance by domain:", blocked) - if not validators.domain(blocked): - logger.warning(f"blocked='{blocked}',software='{software}' is not a valid domain name - SKIPPED!") 
- continue - elif blocked.endswith(".arpa"): - logger.debug("blocked='%s' is a domain for reversed IP addresses - SKIPPED!", blocked) - continue - elif blocked.endswith(".tld"): - logger.debug(f"blocked='{blocked}' is a fake domain - SKIPPED!") - continue - elif not instances.is_registered(blocked): - logger.debug("Hash wasn't found, adding:", blocked, blocker) - try: - instances.add(blocked, blocker, inspect.currentframe().f_code.co_name, nodeinfo_url) - except network.exceptions as exception: - print(f"Exception during adding blocked='{blocked}',blocker='{blocker}': '{type(exception)}'") - continue - - if not blocks.is_instance_blocked(blocker, blocked, block_level): - blocks.add_instance(blocker, blocked, reason, block_level) - - if block_level == "reject": - blockdict.append({ - "blocked": blocked, - "reason" : reason - }) - else: - logger.debug(f"Updating block last seen and reason for blocker='{blocker}',blocked='{blocked}' ...") - blocks.update_last_seen(blocker, blocked, block_level) - blocks.update_reason(reason, blocker, blocked, block_level) - - logger.debug(f"Invoking cookies.clear({blocked}) ...") - cookies.clear(blocked) - - logger.debug("Committing changes ...") - fba.connection.commit() - else: - logger.warning("Unknown software:", blocker, software) + # Some obscure them with question marks, not sure if that's dependent on version or not + row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None) + logger.debug("row[]='%s'", type(row)) + if row is None: + logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software) + instances.set_has_obfuscation(blocker, True) + continue + + block["blocked"] = row["domain"] + origin = row["origin"] + nodeinfo_url = row["nodeinfo_url"] + + logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"]) + if block["blocked"] == "": + logger.debug("block[blocked] is empty - SKIPPED!") + continue + + logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"]) + block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8") + logger.debug("block[blocked]='%s' - AFTER!", block["blocked"]) + + if not utils.is_domain_wanted(block["blocked"]): + logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + continue + elif block["block_level"] in ["accept", "accepted"]: + logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"]) + continue + elif not instances.is_registered(block["blocked"]): + logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker) + federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name) + + block["block_level"] = blocks.alias_block_level(block["block_level"]) + + if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"): + logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker) + blockdict.append({ + "blocked": block["blocked"], + "reason" : block["reason"], + }) + + logger.debug("Invoking cookies.clear(%s) ...", block["blocked"]) + cookies.clear(block["blocked"]) + + logger.debug("Checking if blocker='%s' has pending updates ...", blocker) if instances.has_pending(blocker): - logger.debug(f"Invoking instances.update_data({blocker}) ...") + logger.debug("Flushing updates for blocker='%s' ...", blocker) 
instances.update_data(blocker) - if config.get("bot_enabled") and len(blockdict) > 0: - network.send_bot_post(blocker, blockdict) + logger.debug("Invoking commit() ...") + database.connection.commit() - logger.debug(f"Invoking cookies.clear({blocker}) ...") + logger.debug("Invoking cookies.clear(%s) ...", blocker) cookies.clear(blocker) - logger.debug("EXIT!") + logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict)) + if config.get("bot_enabled") and len(blockdict) > 0: + logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) + network.send_bot_post(blocker, blockdict) -def fetch_observer(args: argparse.Namespace): + logger.debug("Success! - EXIT!") + return 0 + +def fetch_observer(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - types = [ - "akoma", - "birdsitelive", - "bookwyrm", - "calckey", - "diaspora", - "foundkey", - "friendica", - "funkwhale", - "gancio", - "gnusocial", - "gotosocial", - "hometown", - "hubzilla", - "kbin", - "ktistec", - "lemmy", - "mastodon", - "microblogpub", - "misskey", - "mitra", - "mobilizon", - "owncast", - "peertube", - "pixelfed", - "pleroma", - "plume", - "snac", - "takahe", - "wildebeest", - "writefreely" - ] + logger.debug("Invoking locking.acquire() ...") locking.acquire() + source_domain = "fediverse.observer" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + types = list() + if args.software is None: + logger.info("Fetching software list ...") + raw = utils.fetch_url( + f"https://{source_domain}", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw[%s]()=%d", type(raw), len(raw)) + + doc = bs4.BeautifulSoup(raw, features="html.parser") + logger.debug("doc[]='%s'", type(doc)) + + items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"}) + logger.debug("items[]='%s'", type(items)) + + logger.info("Checking %d menu items ...", len(items)) + for item in items: + logger.debug("item[%s]='%s'", type(item), item) + if item.text.lower() == "all": + logger.debug("Skipping 'All' menu entry ...") + continue + + logger.debug("Appending item.text='%s' ...", item.text) + types.append(tidyup.domain(item.text)) + else: + logger.info("Adding args.software='%s' as type ...", args.software) + types.append(args.software) + logger.info("Fetching %d different table data ...", len(types)) for software in types: - doc = None + logger.debug("software='%s' - BEFORE!", software) + if args.software is not None and args.software != software: + logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software) + continue + doc = None try: - logger.debug(f"Fetching table data for software='{software}' ...") - raw = fba.fetch_url(f"https://fediverse.observer/app/views/tabledata.php?software={software}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text - logger.debug(f"raw[{type(raw)}]()={len(raw)}") - - doc = bs4.BeautifulSoup(raw, features='html.parser') - logger.debug("doc[]='%'", type(doc)) + logger.debug("Fetching table data for software='%s' ...", software) + raw = utils.fetch_url( + 
f"https://{source_domain}/app/views/tabledata.php?software={software}", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw[%s]()=%d", type(raw), len(raw)) + + doc = bs4.BeautifulSoup(raw, features="html.parser") + logger.debug("doc[]='%s'", type(doc)) except network.exceptions as exception: - logger.warning(f"Cannot fetch software='{software}' from fediverse.observer: '{type(exception)}'") + logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception)) continue items = doc.findAll("a", {"class": "url"}) @@ -367,28 +517,122 @@ def fetch_observer(args: argparse.Namespace): for item in items: logger.debug("item[]='%s'", type(item)) domain = item.decode_contents() + logger.debug("domain='%s' - AFTER!", domain) - logger.debug("domain='%s'", domain) - if not validators.domain(domain.split("/")[0]): - logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") continue - elif blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): - logger.debug(f"domain='{domain}' is already registered - SKIPPED!") + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' is recently being handled - SKIPPED!", domain) continue - logger.info(f"Fetching instances for domain='{domain}',software='{software}'") + software = software_helper.alias(software) + logger.info("Fetching instances for domain='%s'", domain) federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) - logger.debug(f"Invoking cookies.clear({domain}) ...") - cookies.clear(domain) + logger.debug("Success! 
- EXIT!") + return 0 - logger.debug("EXIT!") +def fetch_todon_wiki(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "wiki.todon.eu" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + blocklist = { + "silenced": list(), + "reject": list(), + } + + raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) + + doc = bs4.BeautifulSoup(raw, "html.parser") + logger.debug("doc[]='%s'", type(doc)) + + silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li") + logger.info("Checking %d silenced/limited entries ...", len(silenced)) + blocklist["silenced"] = utils.find_domains(silenced, "div") + + suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li") + logger.info("Checking %d suspended entries ...", len(suspended)) + blocklist["reject"] = utils.find_domains(suspended, "div") + + blocking = blocklist["silenced"] + blocklist["reject"] + blocker = "todon.eu" + + logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) + instances.set_total_blocks(blocker, blocking) + + blockdict = list() + for block_level in blocklist: + blockers = blocklist[block_level] + + logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers)) + for blocked in blockers: + logger.debug("blocked='%s'", blocked) + + if not instances.is_registered(blocked): + try: + logger.info("Fetching instances from domain='%s' ...", blocked) + federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name) + except network.exceptions as exception: + logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked) + instances.set_last_error(blocked, exception) + + if blocks.is_instance_blocked(blocker, blocked, block_level): + logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level) + continue + + logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level) + if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"): + logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker) + blockdict.append({ + "blocked": blocked, + "reason" : None, + }) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) + if config.get("bot_enabled") and len(blockdict) > 0: + logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) + network.send_bot_post(blocker, blockdict) + + logger.debug("Checking if blocker='%s' has pending updates ...", blocker) + if instances.has_pending(blocker): + logger.debug("Flushing updates for blocker='%s' ...", blocker) + instances.update_data(blocker) + + logger.debug("Success! 
- EXIT!") + return 0 def fetch_cs(args: argparse.Namespace): logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + extensions = [ "extra", "abbr", @@ -409,174 +653,255 @@ def fetch_cs(args: argparse.Namespace): "wikilinks" ] - domains = { + blocklist = { "silenced": list(), "reject" : list(), } - raw = fba.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text - logger.debug(f"raw()={len(raw)}[]='{type(raw)}'") + source_domain = "raw.githubusercontent.com" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) - doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features='html.parser') + raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) + + doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser") + logger.debug("doc()=%d[]='%s'", len(doc), type(doc)) - logger.debug(f"doc()={len(doc)}[]='{type(doc)}'") silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody") - logger.debug(f"silenced[]='{type(silenced)}'") - domains["silenced"] = domains["silenced"] + federation.find_domains(silenced) + logger.debug("silenced[%s]()=%d", type(silenced), len(silenced)) + blocklist["silenced"] = federation.find_domains(silenced) blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody") - logger.debug(f"blocked[]='{type(blocked)}'") - domains["reject"] = domains["reject"] + federation.find_domains(blocked) + logger.debug("blocked[%s]()=%d", type(blocked), len(blocked)) + blocklist["reject"] = federation.find_domains(blocked) - logger.debug(f"domains()={len(domains)}") - if len(domains) > 0: - locking.acquire() + blocking = blocklist["silenced"] + blocklist["reject"] + blocker = "chaos.social" - logger.info(f"Adding {len(domains)} new instances ...") - for block_level in domains: - logger.debug(f"block_level='{block_level}'") + logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) + instances.set_total_blocks(blocker, blocking) - for row in domains[block_level]: - logger.debug(f"row='{row}'") - if not blocks.is_instance_blocked('chaos.social', row["domain"], block_level): - logger.debug(f"domain='{row['domain']}',block_level='{block_level}' blocked by chaos.social, adding ...") - blocks.add_instance('chaos.social', row["domain"], row["reason"], block_level) + logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"])) + if len(blocking) > 0: + blockdict = list() + for block_level in blocklist: + logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level])) - if not instances.is_registered(row["domain"]): + for row in blocklist[block_level]: + logger.debug("row[%s]='%s'", type(row), row) + if not "domain" in row: + logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row)) + continue + elif instances.is_recent(row["domain"], "last_blocked"): + logger.debug("row[domain]='%s' has 
been recently crawled - SKIPPED!", row["domain"]) + continue + elif not instances.is_registered(row["domain"]): try: - logger.info(f"Fetching instances from domain='{row['domain']}' ...") - federation.fetch_instances(row["domain"], 'chaos.social', None, inspect.currentframe().f_code.co_name) - - logger.debug(f"Invoking cookies.clear({row['domain']}) ...") - cookies.clear(row["domain"]) + logger.info("Fetching instances from domain='%s' ...", row["domain"]) + federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_cs) from domain='{row['domain']}'") + logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"]) instances.set_last_error(row["domain"], exception) - logger.debug("Committing changes ...") - fba.connection.commit() + if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"): + logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker) + blockdict.append({ + "blocked": row["domain"], + "reason" : row["reason"], + }) - logger.debug("EXIT!") + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) + if config.get("bot_enabled") and len(blockdict) > 0: + logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict)) + network.send_bot_post(blocker, blockdict) -def fetch_fba_rss(args: argparse.Namespace): + logger.debug("Checking if blocker='%s' has pending updates ...", blocker) + if instances.has_pending(blocker): + logger.debug("Flushing updates for blocker='%s' ...", blocker) + instances.update_data(blocker) + + logger.debug("Success! 
- EXIT!") + return 0 + +def fetch_fba_rss(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + domains = list() - logger.info(f"Fetch FBA-specific RSS args.feed='{args.feed}' ...") - response = fba.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + components = urlparse(args.feed) - logger.debug(f"response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}") + if sources.is_recent(components.netloc): + logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc) + return 0 + else: + logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc) + sources.update(components.netloc) + + logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed) + response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code < 300 and len(response.text) > 0: - logger.debug(f"Parsing RSS feed ({len(response.text)} Bytes) ...") + logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text)) rss = atoma.parse_rss_bytes(response.content) - logger.debug(f"rss[]='{type(rss)}'") + logger.debug("rss[]='%s'", type(rss)) for item in rss.items: - logger.debug(f"item={item}") - domain = item.link.split("=")[1] + logger.debug("item='%s'", item) + domain = tidyup.domain(item.link.split("=")[1]) + + logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) - if blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) + if not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: - logger.debug(f"domain='{domain}' is already added - SKIPPED!") + logger.debug("domain='%s' is already added - SKIPPED!", domain) continue elif instances.is_registered(domain): - logger.debug(f"domain='{domain}' is already registered - SKIPPED!") + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue - logger.debug(f"Adding domain='{domain}'") + logger.debug("Adding domain='%s'", domain) domains.append(domain) - logger.debug(f"domains()={len(domains)}") + logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - - logger.info(f"Adding {len(domains)} new instances ...") + logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s'", domain) try: - logger.info(f"Fetching instances from domain='{domain}' ...") + logger.info("Fetching instances from domain='%s' ...", domain) federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) - - logger.debug(f"Invoking cookies.clear({domain}) ...") - cookies.clear(domain) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_fba_rss) from 
domain='{domain}'") + logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) + return 100 - logger.debug("EXIT!") + logger.debug("Success! - EXIT!") + return 0 -def fetch_fbabot_atom(args: argparse.Namespace): +def fetch_fbabot_atom(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - feed = "https://ryona.agency/users/fba/feed.atom" + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "ryona.agency" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + feed = f"https://{source_domain}/users/fba/feed.atom" domains = list() - logger.info(f"Fetching ATOM feed='{feed}' from FBA bot account ...") - response = fba.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed) + response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - logger.debug(f"response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}") + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code < 300 and len(response.text) > 0: - logger.debug(f"Parsing ATOM feed ({len(response.text)} Bytes) ...") + logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text)) atom = atoma.parse_atom_bytes(response.content) - logger.debug(f"atom[]='{type(atom)}'") + logger.debug("atom[]='%s'", type(atom)) for entry in atom.entries: - logger.debug(f"entry[]='{type(entry)}'") + logger.debug("entry[]='%s'", type(entry)) doc = bs4.BeautifulSoup(entry.content.value, "html.parser") - logger.debug("doc[]='%'", type(doc)) + logger.debug("doc[]='%s'", type(doc)) for element in doc.findAll("a"): + logger.debug("element[]='%s'", type(element)) for href in element["href"].split(","): - logger.debug(f"href[{type(href)}]={href}") + logger.debug("href[%s]='%s' - BEFORE!", type(href), href) domain = tidyup.domain(href) - logger.debug("domain='%s'", domain) - if blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) + logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: - logger.debug(f"domain='{domain}' is already added - SKIPPED!") + logger.debug("domain='%s' is already added - SKIPPED!", domain) continue elif instances.is_registered(domain): - logger.debug(f"domain='{domain}' is already registered - SKIPPED!") + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue - logger.debug(f"Adding domain='{domain}',domains()={len(domains)}") + logger.debug("Adding domain='%s',domains()=%d", 
domain, len(domains)) domains.append(domain) - logger.debug(f"domains({len(domains)})={domains}") + logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - - logger.info(f"Adding {len(domains)} new instances ...") + logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s'", domain) try: - logger.info(f"Fetching instances from domain='{domain}' ...") - federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) - - logger.debug(f"Invoking cookies.clear({domain}) ...") - cookies.clear(domain) + logger.info("Fetching instances from domain='%s' ...", domain) + federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_fbabot_atom) from domain='{domain}'") + logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) + return 100 - logger.debug("EXIT!") + logger.debug("Success! - EXIT!") + return 0 def fetch_instances(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("args.domain='%s' - checking ...", args.domain) + if not validators.domain(args.domain): + logger.warning("args.domain='%s' is not valid.", args.domain) + return 100 + elif blacklist.is_blacklisted(args.domain): + logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain) + return 101 + + logger.debug("Invoking locking.acquire() ...") locking.acquire() # Initial fetch try: - logger.info(f"Fetching instances from args.domain='{args.domain}' ...") + logger.info("Fetching instances from args.domain='%s' ...", args.domain) federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name) - - logger.debug(f"Invoking cookies.clear({args.domain}) ...") - cookies.clear(args.domain) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_instances) from args.domain='{args.domain}'") + logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain) instances.set_last_error(args.domain, exception) - + instances.update_data(args.domain) return 100 if args.single: @@ -584,37 +909,52 @@ def fetch_instances(args: argparse.Namespace) -> int: return 0 # Loop through some instances - fba.cursor.execute( - "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")] + database.cursor.execute( + "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) 
ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")] ) - rows = fba.cursor.fetchall() + rows = database.cursor.fetchall() logger.info("Checking %d entries ...", len(rows)) for row in rows: - logger.debug(f"domain='{row[0]}'") - if blacklist.is_blacklisted(row[0]): - logger.warning("domain is blacklisted:", row[0]) + logger.debug("row[domain]='%s'", row["domain"]) + if row["domain"] == "": + logger.debug("row[domain] is empty - SKIPPED!") continue - try: - logger.info(f"Fetching instances for instance '{row[0]}' ('{row[2]}') of origin='{row[1]}',nodeinfo_url='{row[3]}'") - federation.fetch_instances(row[0], row[1], row[2], inspect.currentframe().f_code.co_name, row[3]) + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + domain = row["domain"].encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): + logger.warning("Domain domain='%s' is not wanted - SKIPPED!", domain) + continue - logger.debug(f"Invoking cookies.clear({row[0]}) ...") - cookies.clear(row[0]) + try: + logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"]) + federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"]) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_instances) from domain='{row[0]}'") - instances.set_last_error(row[0], exception) + logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain) + instances.set_last_error(domain, exception) logger.debug("Success - EXIT!") return 0 -def fetch_oliphant(args: argparse.Namespace): +def fetch_oliphant(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() + source_domain = "codeberg.org" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + # Base URL - base_url = "https://codeberg.org/oliphant/blocklists/raw/branch/main/blocklists" + base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists" # URLs to fetch blocklists = ( @@ -645,6 +985,9 @@ def fetch_oliphant(args: argparse.Namespace): },{ "blocker": "sunny.garden", "csv_url": "mastodon/sunny.garden.csv", + },{ + "blocker": "sunny.garden", + "csv_url": "mastodon/gardenfence.csv", },{ "blocker": "solarpunk.moe", "csv_url": "mastodon/solarpunk.moe.csv", @@ -654,106 +997,751 @@ def fetch_oliphant(args: argparse.Namespace): },{ "blocker": "union.place", "csv_url": "mastodon/union.place.csv", + },{ + "blocker": "oliphant.social", + "csv_url": "mastodon/birdsite.csv", } ) domains = list() + + logger.debug("Downloading %d files ...", len(blocklists)) for block in blocklists: # Is domain given and not equal blocker? 
if isinstance(args.domain, str) and args.domain != block["blocker"]: - logger.debug(f"Skipping blocker='{block['blocker']}', not matching args.domain='{args.domain}'") + logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain) continue elif args.domain in domains: - logger.debug(f"args.domain='{args.domain}' already handled - SKIPPED!") + logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain) continue # Fetch this URL - logger.info(f"Fetching csv_url='{block['csv_url']}' for blocker='{block['blocker']}' ...") - response = fba.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - - logger.debug("response[]='%s'", type(response)) - if response.ok and response.content != "": - logger.debug(f"Fetched {len(response.content)} Bytes, parsing CSV ...") - reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect="unix") - - logger.debug(f"reader[]='{type(reader)}'") - for row in reader: - domain = None - if "#domain" in row: - domain = row["#domain"] - elif "domain" in row: - domain = row["domain"] - else: - logger.debug(f"row='{row}' does not contain domain column") - continue + logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"]) + response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - if not validators.domain(domain): - logger.warning("domain='%s' is not a valid domain name - SKIPPED!", domain) - continue - elif domain.endswith(".arpa"): - logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain) - continue - elif domain.endswith(".tld"): - logger.debug("domain='%s' is a fake domain - SKIPPED!", domain) - continue - elif blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) - continue + logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content)) + if not response.ok or response.status_code >= 300 or response.content == "": + logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"]) + continue + + logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content)) + reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix") - logger.debug(f"Marking domain='{domain}' as handled") - domains.append(domain) + blockdict = list() - logger.debug(f"Processing domain='{domain}' ...") - processed = fba.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name) + cnt = 0 + for row in reader: + logger.debug("row[%s]='%s'", type(row), row) + domain = severity = None + reject_media = reject_reports = False + + if "#domain" in row: + domain = row["#domain"] + elif "domain" in row: + domain = row["domain"] + else: + logger.debug("row='%s' does not contain domain column", row) + continue - logger.debug(f"processed='{processed}'") + if "#severity" in row: + severity = blocks.alias_block_level(row["#severity"]) + elif "severity" in row: + severity = blocks.alias_block_level(row["severity"]) + else: + logger.debug("row='%s' does not contain severity column", row) + continue - logger.debug("EXIT!") + if "#reject_media" in row and row["#reject_media"].lower() == "true": + reject_media = True + elif "reject_media" in row and row["reject_media"].lower() == "true": + reject_media = 
True + + if "#reject_reports" in row and row["#reject_reports"].lower() == "true": + reject_reports = True + elif "reject_reports" in row and row["reject_reports"].lower() == "true": + reject_reports = True + + cnt = cnt + 1 + logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif domain.endswith(".onion"): + logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain) + continue + elif domain.endswith(".arpa"): + logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain) + continue + elif domain.endswith(".tld"): + logger.debug("domain='%s' is a fake domain - SKIPPED", domain) + continue + elif domain.find("*") >= 0 or domain.find("?") >= 0: + logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"]) + domain = utils.deobfuscate(domain, block["blocker"]) + logger.debug("domain='%s' - AFTER!", domain) + + if not validators.domain(domain): + logger.debug("domain='%s' is not a valid domain - SKIPPED!") + continue + elif blacklist.is_blacklisted(domain): + logger.warning("domain='%s' is blacklisted - SKIPPED!", domain) + continue + + logger.debug("Marking domain='%s' as handled", domain) + domains.append(domain) + + logger.debug("Processing domain='%s' ...", domain) + processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name) + logger.debug("processed='%s'", processed) + + if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"): + logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"]) + blockdict.append({ + "blocked": domain, + "reason" : block["reason"], + }) + + if reject_media: + processing.block(block["blocker"], domain, None, "reject_media") + if reject_reports: + processing.block(block["blocker"], domain, None, "reject_reports") + + logger.debug("block[blocker]='%s'", block["blocker"]) + if block["blocker"] != "chaos.social": + logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains)) + instances.set_total_blocks(block["blocker"], domains) -def fetch_txt(args: argparse.Namespace): + logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"]) + if instances.has_pending(block["blocker"]): + logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"]) + instances.update_data(block["blocker"]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict)) + if config.get("bot_enabled") and len(blockdict) > 0: + logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict)) + network.send_bot_post(block["blocker"], blockdict) + + logger.debug("Success! 
- EXIT!") + return 0 + +def fetch_txt(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() # Static URLs - urls = ( - "https://seirdy.one/pb/bsl.txt", - ) - - logger.info(f"Checking {len(urls)} text file(s) ...") - for url in urls: - logger.debug("Fetching url='%s' ...", url) - response = fba.fetch_url(url, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - - logger.debug("response[]='%s'", type(response)) - if response.ok and response.text != "": - logger.debug(f"Returned {len(response.text.strip())} Bytes for processing") + urls = ({ + "blocker": "seirdy.one", + "url" : "https://seirdy.one/pb/bsl.txt", + },) + + logger.info("Checking %d text file(s) ...", len(urls)) + for row in urls: + logger.debug("Fetching row[url]='%s' ...", row["url"]) + response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) + if response.ok and response.status_code < 300 and response.text != "": + logger.debug("Returned %d Bytes for processing", len(response.text.strip())) domains = response.text.split("\n") - logger.info(f"Processing {len(domains)} domains ...") + logger.info("Processing %d domains ...", len(domains)) for domain in domains: - logger.debug("domain='%s'", domain) + logger.debug("domain='%s' - BEFORE!", domain) + domain = tidyup.domain(domain) + + logger.debug("domain='%s' - AFTER!", domain) if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not validators.domain(domain): - logger.warning("domain='%s' is not a valid domain name - SKIPPED!", domain) + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) continue - elif domain.endswith(".arpa"): - logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain) - continue - elif domain.endswith(".tld"): - logger.debug("domain='%s' is a fake domain - SKIPPED!", domain) - continue - elif blacklist.is_blacklisted(domain): - logger.debug("domain='%s' is blacklisted - SKIPPED!", domain) + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue - logger.debug("domain='%s'", domain) - processed = fba.process_domain(domain, 'seirdy.one', inspect.currentframe().f_code.co_name) + logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"]) + processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name) - logger.debug(f"processed='{processed}'") + logger.debug("processed='%s'", processed) if not processed: - logger.debug(f"domain='{domain}' was not generically processed - SKIPPED!") + logger.debug("domain='%s' was not generically processed - SKIPPED!", domain) continue - logger.debug("EXIT!") + logger.debug("Success! 
- EXIT!")
+    return 0
+
+def fetch_fedipact(args: argparse.Namespace) -> int:
+    logger.debug("args[]='%s' - CALLED!", type(args))
+
+    logger.debug("Invoking locking.acquire() ...")
+    locking.acquire()
+
+    source_domain = "fedipact.online"
+    if sources.is_recent(source_domain):
+        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
+        return 0
+    else:
+        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
+        sources.update(source_domain)
+
+    response = utils.fetch_url(
+        f"https://{source_domain}",
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    )
+
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
+    if response.ok and response.status_code < 300 and response.text != "":
+        logger.debug("Parsing %d Bytes ...", len(response.text))
+
+        doc = bs4.BeautifulSoup(response.text, "html.parser")
+        logger.debug("doc[]='%s'", type(doc))
+
+        rows = doc.findAll("li")
+        logger.info("Checking %d row(s) ...", len(rows))
+        for row in rows:
+            logger.debug("row[]='%s'", type(row))
+            domain = tidyup.domain(row.contents[0])
+
+            logger.debug("domain='%s' - AFTER!", domain)
+            if domain == "":
+                logger.debug("domain is empty - SKIPPED!")
+                continue
+
+            logger.debug("domain='%s' - BEFORE!", domain)
+            domain = domain.encode("idna").decode("utf-8")
+            logger.debug("domain='%s' - AFTER!", domain)
+
+            if not utils.is_domain_wanted(domain):
+                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
+                continue
+            elif instances.is_registered(domain):
+                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
+                continue
+            elif instances.is_recent(domain):
+                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
+                continue
+
+            logger.info("Fetching domain='%s' ...", domain)
+            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
+
+    logger.debug("Success! 
- EXIT!") + return 0 + +def fetch_joinfediverse(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "joinfediverse.wiki" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + raw = utils.fetch_url( + f"https://{source_domain}/FediBlock", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) + + doc = bs4.BeautifulSoup(raw, "html.parser") + logger.debug("doc[]='%s'", type(doc)) + + tables = doc.findAll("table", {"class": "wikitable"}) + + logger.info("Analyzing %d table(s) ...", len(tables)) + blocklist = list() + for table in tables: + logger.debug("table[]='%s'", type(table)) + + rows = table.findAll("tr") + logger.info("Checking %d row(s) ...", len(rows)) + block_headers = dict() + for row in rows: + logger.debug("row[%s]='%s'", type(row), row) + + headers = row.findAll("th") + logger.debug("Found headers()=%d header(s)", len(headers)) + if len(headers) > 1: + block_headers = dict() + cnt = 0 + for header in headers: + cnt = cnt + 1 + logger.debug("header[]='%s',cnt=%d", type(header), cnt) + text = header.contents[0] + + logger.debug("text[]='%s'", type(text)) + if not isinstance(text, str): + logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text)) + continue + elif validators.domain(text.strip()): + logger.debug("text='%s' is a domain - SKIPPED!", text.strip()) + continue + + text = tidyup.domain(text.strip()) + logger.debug("text='%s' - AFTER!", text) + if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]: + logger.debug("Found header: '%s'=%d", text, cnt) + block_headers[cnt] = text + + elif len(block_headers) == 0: + logger.debug("row is not scrapable - SKIPPED!") + continue + elif len(block_headers) > 0: + logger.debug("Found a row with %d scrapable headers ...", len(block_headers)) + cnt = 0 + block = dict() + + for element in row.find_all(["th", "td"]): + cnt = cnt + 1 + logger.debug("element[]='%s',cnt=%d", type(element), cnt) + if cnt in block_headers: + logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt]) + + text = element.text.strip() + key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked" + + logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text) + if key in ["domain", "instance"]: + block[key] = text + elif key == "reason": + block[key] = tidyup.reason(text) + elif key == "subdomain(s)": + block[key] = list() + if text != "": + block[key] = text.split("/") + else: + logger.debug("key='%s'", key) + block[key] = text + + logger.debug("block()=%d ...", len(block)) + if len(block) > 0: + logger.debug("Appending block()=%d ...", len(block)) + blocklist.append(block) + + logger.debug("blocklist()=%d", len(blocklist)) + + database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'") + domains = database.cursor.fetchall() + + logger.debug("domains(%d)[]='%s'", len(domains), type(domains)) + blocking = list() + for block in blocklist: + logger.debug("block='%s'", block) + if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0: + origin = block["blocked"] + logger.debug("origin='%s'", 
origin)
+            for subdomain in block["subdomain(s)"]:
+                block["blocked"] = subdomain + "." + origin
+                logger.debug("block[blocked]='%s'", block["blocked"])
+                blocking.append(block)
+        else:
+            blocking.append(block)
+
+    logger.debug("blocking()=%d", len(blocking))
+    for block in blocking:
+        logger.debug("block[]='%s'", type(block))
+        if "blocked" not in block:
+            raise KeyError(f"block()={len(block)} does not have element 'blocked'")
+
+        block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
+        logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
+
+        if block["blocked"] == "":
+            logger.debug("block[blocked] is empty - SKIPPED!")
+            continue
+        elif not utils.is_domain_wanted(block["blocked"]):
+            logger.warning("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
+            continue
+        elif instances.is_recent(block["blocked"]):
+            logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
+            continue
+
+        logger.info("Processing blocked='%s' ...", block["blocked"])
+        processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
+
+    blockdict = list()
+    for blocker in domains:
+        blocker = blocker[0]
+        logger.debug("blocker[%s]='%s'", type(blocker), blocker)
+
+        for block in blocking:
+            logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
+            block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
+
+            logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
+            if block["blocked"] == "":
+                logger.debug("block[blocked] is empty - SKIPPED!")
+                continue
+            elif not utils.is_domain_wanted(block["blocked"]):
+                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+                continue
+
+            logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
+            if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
+                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
+                blockdict.append({
+                    "blocked": block["blocked"],
+                    "reason" : block["reason"],
+                })
+
+        if instances.has_pending(blocker):
+            logger.debug("Flushing updates for blocker='%s' ...", blocker)
+            instances.update_data(blocker)
+
+    logger.debug("Invoking commit() ...")
+    database.connection.commit()
+
+    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
+    if config.get("bot_enabled") and len(blockdict) > 0:
+        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
+        network.send_bot_post(blocker, blockdict)
+
+    logger.debug("Success! 
- EXIT!")
+    return 0
+
+def recheck_obfuscation(args: argparse.Namespace) -> int:
+    logger.debug("args[]='%s' - CALLED!", type(args))
+
+    logger.debug("Invoking locking.acquire() ...")
+    locking.acquire()
+
+    if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
+    elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
+    else:
+        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
+
+    rows = database.cursor.fetchall()
+    logger.info("Checking %d domains ...", len(rows))
+    for row in rows:
+        logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
+        if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
+            logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
+            continue
+
+        blocking = list()
+        if row["software"] == "pleroma":
+            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
+            blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
+        elif row["software"] == "mastodon":
+            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
+            blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
+        elif row["software"] == "lemmy":
+            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
+            blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
+        elif row["software"] == "friendica":
+            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
+            blocking = friendica.fetch_blocks(row["domain"])
+        elif row["software"] == "misskey":
+            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
+            blocking = misskey.fetch_blocks(row["domain"])
+        else:
+            logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
+
+        logger.debug("row[domain]='%s'", row["domain"])
+        if row["domain"] != "chaos.social":
+            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
+            instances.set_total_blocks(row["domain"], blocking)
+
+        obfuscated = 0
+        blockdict = list()
+
+        logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
+        for block in blocking:
+            logger.debug("block[blocked]='%s'", block["blocked"])
+            blocked = None
+
+            if block["blocked"] == "":
+                logger.debug("block[blocked] is empty - SKIPPED!")
+                continue
+            elif block["blocked"].endswith(".arpa"):
+                logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
+                continue
+            elif block["blocked"].endswith(".tld"):
+                logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
+                continue
+            elif block["blocked"].endswith(".onion"):
+                logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
+                continue
+            elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
+                logger.debug("block='%s' is obfuscated.", block["blocked"])
+                obfuscated = obfuscated + 1
+                blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
+            elif not utils.is_domain_wanted(block["blocked"]):
+                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
+                continue
+            elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
+                logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
+                continue
+
+            logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
+            if blocked is not None and blocked != block["blocked"]:
+                logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
+                obfuscated = obfuscated - 1
+                if blocks.is_instance_blocked(row["domain"], blocked):
+                    logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
+                    continue
+
+            block["block_level"] = blocks.alias_block_level(block["block_level"])
+
+            logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
+            if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
+                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
+                blockdict.append({
+                    "blocked": blocked,
+                    "reason" : block["reason"],
+                })
+
+        logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
+        if obfuscated == 0 and len(blocking) > 0:
+            logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
+            instances.set_has_obfuscation(row["domain"], False)
+
+        if instances.has_pending(row["domain"]):
+            logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
+            instances.update_data(row["domain"])
+
+        logger.debug("Invoking commit() ...")
+        database.connection.commit()
+
+        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
+        if config.get("bot_enabled") and len(blockdict) > 0:
+            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
+            network.send_bot_post(row["domain"], blockdict)
+
+    logger.debug("Success! 
- EXIT!")
+    return 0
+
+def fetch_fedilist(args: argparse.Namespace) -> int:
+    logger.debug("args[]='%s' - CALLED!", type(args))
+
+    logger.debug("Invoking locking.acquire() ...")
+    locking.acquire()
+
+    source_domain = "demo.fedilist.com"
+    if sources.is_recent(source_domain):
+        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
+        return 0
+    else:
+        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
+        sources.update(source_domain)
+
+    url = f"http://{source_domain}/instance/csv?onion=not"
+    if args.software is not None and args.software != "":
+        logger.debug("args.software='%s'", args.software)
+        url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
+
+    logger.info("Fetching url='%s' ...", url)
+    response = reqto.get(
+        url,
+        headers=network.web_headers,
+        timeout=(config.get("connection_timeout"), config.get("read_timeout")),
+        allow_redirects=False
+    )
+
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
+    if not response.ok or response.status_code >= 300 or len(response.content) == 0:
+        logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
+        return 1
+
+    reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
+
+    logger.debug("reader[]='%s'", type(reader))
+    for row in reader:
+        logger.debug("row[]='%s'", type(row))
+        if "hostname" not in row:
+            logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
+            continue
+
+        logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
+        domain = tidyup.domain(row["hostname"])
+        logger.debug("domain='%s' - AFTER!", domain)
+
+        if domain == "":
+            logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
+            continue
+
+        logger.debug("domain='%s' - BEFORE!", domain)
+        domain = domain.encode("idna").decode("utf-8")
+        logger.debug("domain='%s' - AFTER!", domain)
+
+        if not utils.is_domain_wanted(domain):
+            logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
+            continue
+        elif (args.all is None or not args.all) and instances.is_registered(domain):
+            logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s'", domain, type(args.all))
+            continue
+        elif instances.is_recent(domain):
+            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
+            continue
+
+        logger.info("Fetching instances from domain='%s' ...", domain)
+        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
+
+    logger.debug("Success! 
- EXIT!") + return 0 + +def update_nodeinfo(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + if args.domain is not None and args.domain != "": + logger.debug("Fetching args.domain='%s'", args.domain) + database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain]) + elif args.software is not None and args.software != "": + logger.info("Fetching domains for args.software='%s'", args.software) + database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software]) + else: + logger.info("Fetching domains for recently updated ...") + database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")]) + + domains = database.cursor.fetchall() + + logger.info("Checking %d domain(s) ...", len(domains)) + cnt = 0 + for row in domains: + logger.debug("row[]='%s'", type(row)) + try: + logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100)) + software = federation.determine_software(row["domain"]) + + logger.debug("Determined software='%s'", software) + if (software != row["software"] and software is not None) or args.force is True: + logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software) + instances.set_software(row["domain"], software) + + instances.set_success(row["domain"]) + except network.exceptions as exception: + logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"]) + instances.set_last_error(row["domain"], exception) + + instances.set_last_nodeinfo(row["domain"]) + instances.update_data(row["domain"]) + cnt = cnt + 1 + + logger.debug("Success! - EXIT!") + return 0 + +def fetch_instances_social(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "instances.social" + + if config.get("instances_social_api_key") == "": + logger.error("API key not set. 
Please set in your config.json file.") + return 1 + elif sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + headers = { + "Authorization": f"Bearer {config.get('instances_social_api_key')}", + } + + fetched = network.get_json_api( + source_domain, + "/api/1.0/instances/list?count=0&sort_by=name", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + logger.debug("fetched[]='%s'", type(fetched)) + + if "error_message" in fetched: + logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"]) + return 2 + elif "exception" in fetched: + logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"])) + return 3 + elif "json" not in fetched: + logger.warning("fetched has no element 'json' - EXIT!") + return 4 + elif "instances" not in fetched["json"]: + logger.warning("fetched[row] has no element 'instances' - EXIT!") + return 5 + + domains = list() + rows = fetched["json"]["instances"] + + logger.info("Checking %d row(s) ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + domain = tidyup.domain(row["name"]) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + continue + elif domain in domains: + logger.debug("domain='%s' is already added - SKIPPED!", domain) + continue + elif instances.is_registered(domain): + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue + + logger.info("Fetching instances from domain='%s'", domain) + federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 + +def convert_idna(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "domain") + + database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "origin") + + database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocker") + + database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocked") + + logger.debug("Success! - EXIT!") + return 0
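The commands in this patch repeatedly normalize hostnames with domain.encode("idna").decode("utf-8"), and convert_idna() applies the same conversion in bulk through the translate_idnas() helpers. The following is a minimal standalone sketch of that conversion; the to_idna() wrapper and the sample domains are illustrative only and not part of the patch:

# Illustrative sketch: mirrors the inline domain.encode("idna").decode("utf-8") calls above.
def to_idna(domain: str) -> str:
    # encode("idna") yields the ASCII ("xn--") form of an internationalized domain name;
    # decode("utf-8") turns the resulting bytes back into a str for storage and comparison.
    return domain.encode("idna").decode("utf-8")

print(to_idna("example.com"))     # plain ASCII names pass through unchanged: example.com
print(to_idna("bücher.example"))  # Unicode labels become punycode: xn--bcher-kva.example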