X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fcommands.py;h=3fb4ea6869ded2a25451493b1172fd5db88889bf;hb=6dfd4d714dee64c124a249a3a379e60023ebd3ea;hp=73814f3323ade601d1f91b871a8c2edcaf186a75;hpb=7d9bdb821cfd458c0344b44991188be25d43a140;p=fba.git diff --git a/fba/commands.py b/fba/commands.py index 73814f3..3fb4ea6 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -20,6 +20,8 @@ import json import logging import time +from urllib.parse import urlparse + import argparse import atoma import bs4 @@ -27,22 +29,27 @@ import markdown import reqto import validators -from fba import csrf from fba import database from fba import utils from fba.helpers import blacklist +from fba.helpers import blocklists from fba.helpers import config from fba.helpers import cookies +from fba.helpers import dicts as dict_helper +from fba.helpers import domain as domain_helper from fba.helpers import locking +from fba.helpers import processing from fba.helpers import software as software_helper from fba.helpers import tidyup +from fba.http import csrf from fba.http import federation from fba.http import network from fba.models import blocks from fba.models import instances +from fba.models import sources from fba.networks import friendica from fba.networks import lemmy @@ -56,6 +63,7 @@ logger = logging.getLogger(__name__) def check_instance(args: argparse.Namespace) -> int: logger.debug("args.domain='%s' - CALLED!", args.domain) + status = 0 if not validators.domain(args.domain): logger.warning("args.domain='%s' is not valid", args.domain) @@ -98,20 +106,28 @@ def check_nodeinfo(args: argparse.Namespace) -> int: def fetch_pixelfed_api(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - # No CSRF by default, you don't have to add network.api_headers by yourself here + # No CSRF by default, you don't have to add network.source_headers by yourself here headers = tuple() + source_domain = "pixelfed.org" + + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) try: - logger.debug("Checking CSRF from pixelfed.org") - headers = csrf.determine("pixelfed.org", dict()) + logger.debug("Checking CSRF from source_domain='%s' ...", source_domain) + headers = csrf.determine(source_domain, dict()) except network.exceptions as exception: logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__) return list() try: - logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers)) + logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers)) fetched = network.get_json_api( - "pixelfed.org", + source_domain, "/api/v1/servers/all.json?scope=All&country=all&language=all", headers, (config.get("connection_timeout"), config.get("read_timeout")) @@ -135,18 +151,23 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: elif row["domain"] == "": logger.debug("row[domain] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(row["domain"]): - logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) + + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + domain = row["domain"].encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue - elif instances.is_registered(row["domain"]): - logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"]) + elif instances.is_registered(domain): + logger.debug("domain='%s' is already registered - SKIPPED!", domain) continue - elif instances.is_recent(row["domain"]): - logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"]) + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue - logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"]) - federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name) + logger.debug("Fetching instances from domain='%s' ...", domain) + federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception)) @@ -157,15 +178,32 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: def fetch_bkali(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "gql.api.bka.li" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + domains = list() try: - fetched = network.post_json_api("gql.api.bka.li", "/v1/graphql", json.dumps({ - "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" - })) + logger.info("Fetching domainlist from source_domain='%s' ...", source_domain) + fetched = network.post_json_api( + source_domain, + "/v1/graphql", + json.dumps({ + "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" + }) + ) logger.debug("fetched[]='%s'", type(fetched)) if "error_message" in fetched: - logger.warning("post_json_api() for 'gql.api.bka.li' returned error message='%s", fetched["error_message"]) + logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"]) return 100 elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]: logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"]) @@ -189,8 +227,8 @@ def fetch_bkali(args: argparse.Namespace) -> int: elif entry["domain"] == "": logger.debug("entry[domain] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(entry["domain"]): - logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"]) + elif not domain_helper.is_wanted(entry["domain"]): + logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"]) continue elif instances.is_registered(entry["domain"]): logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"]) @@ -208,10 +246,12 @@ def fetch_bkali(args: argparse.Namespace) -> int: logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + try: logger.info("Fetching instances from domain='%s' ...", domain) federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name) @@ -237,72 +277,81 @@ def fetch_blocks(args: argparse.Namespace) -> int: logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain) return 102 + logger.debug("Invoking locking.acquire() ...") locking.acquire() if args.domain is not None and args.domain != "": # Re-check single domain - logger.debug("Querying database for single args.domain='%s' ...", args.domain) + logger.debug("Querying database for args.domain='%s' ...", args.domain) database.cursor.execute( - "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain] + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain] ) elif args.software is not None and args.software != "": # Re-check single software logger.debug("Querying database for args.software='%s' ...", args.software) database.cursor.execute( - "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software] + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software] + ) + elif args.force: + # Re-check all + logger.debug("Re-checking all instances ...") + database.cursor.execute( + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC" ) else: # Re-check after "timeout" (aka. minimum interval) database.cursor.execute( - "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")] + "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")] ) rows = database.cursor.fetchall() logger.info("Checking %d entries ...", len(rows)) for blocker, software, origin, nodeinfo_url in rows: logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url) - blocker = tidyup.domain(blocker) - logger.debug("blocker='%s' - AFTER!", blocker) - if blocker == "": - logger.warning("blocker is now empty!") - continue - elif nodeinfo_url is None or nodeinfo_url == "": - logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software) - continue - elif not utils.is_domain_wanted(blocker): + if not domain_helper.is_wanted(blocker): logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker) continue - logger.debug("blocker='%s'", blocker) + logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker) instances.set_last_blocked(blocker) instances.set_has_obfuscation(blocker, False) - blocking = list() - blockdict = list() - if software == "pleroma": - logger.info("blocker='%s',software='%s'", blocker, software) - blocking = pleroma.fetch_blocks(blocker, nodeinfo_url) - elif software == "mastodon": - logger.info("blocker='%s',software='%s'", blocker, software) - blocking = mastodon.fetch_blocks(blocker, nodeinfo_url) - elif software == "lemmy": - logger.info("blocker='%s',software='%s'", blocker, software) - blocking = lemmy.fetch_blocks(blocker, nodeinfo_url) - elif software == "friendica": - logger.info("blocker='%s',software='%s'", blocker, software) - blocking = friendica.fetch_blocks(blocker) - elif software == "misskey": - logger.info("blocker='%s',software='%s'", blocker, software) - blocking = misskey.fetch_blocks(blocker) - else: - logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software) + # c.s isn't part of oliphant's "hidden" blocklists + if blocker == "chaos.social" or blocklists.has(blocker): + logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker) + continue + + logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker) + blocking = federation.fetch_blocks(blocker) + + logger.info("blocker='%s',software='%s' has %d block entries returned.", blocker, software, len(blocking)) + if len(blocking) == 0: + logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software) + if software == "pleroma": + blocking = pleroma.fetch_blocks(blocker) + logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software) + elif software == "mastodon": + blocking = mastodon.fetch_blocks(blocker) + logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software) + elif software == "lemmy": + blocking = lemmy.fetch_blocks(blocker) + logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software) + elif software == "friendica": + blocking = friendica.fetch_blocks(blocker) + logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software) + elif software == "misskey": + blocking = misskey.fetch_blocks(blocker) + logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software) + else: + logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software) logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) instances.set_total_blocks(blocker, blocking) - logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software) blockdict = list() + + logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software) for block in blocking: logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"]) @@ -362,8 +411,13 @@ def fetch_blocks(args: argparse.Namespace) -> int: if block["blocked"] == "": logger.debug("block[blocked] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(block["blocked"]): - logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + + logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"]) + block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8") + logger.debug("block[blocked]='%s' - AFTER!", block["blocked"]) + + if not domain_helper.is_wanted(block["blocked"]): + logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif block["block_level"] in ["accept", "accepted"]: logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"]) @@ -372,9 +426,9 @@ def fetch_blocks(args: argparse.Namespace) -> int: logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker) federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name) - block["block_level"] = utils.alias_block_level(block["block_level"]) + block["block_level"] = blocks.alias_block_level(block["block_level"]) - if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"): + if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker) blockdict.append({ "blocked": block["blocked"], @@ -387,7 +441,7 @@ def fetch_blocks(args: argparse.Namespace) -> int: logger.debug("Checking if blocker='%s' has pending updates ...", blocker) if instances.has_pending(blocker): logger.debug("Flushing updates for blocker='%s' ...", blocker) - instances.update_data(blocker) + instances.update(blocker) logger.debug("Invoking commit() ...") database.connection.commit() @@ -406,14 +460,22 @@ def fetch_blocks(args: argparse.Namespace) -> int: def fetch_observer(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - # Acquire lock + logger.debug("Invoking locking.acquire() ...") locking.acquire() + source_domain = "fediverse.observer" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + types = list() if args.software is None: logger.info("Fetching software list ...") raw = utils.fetch_url( - "https://fediverse.observer", + f"https://{source_domain}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")) ).text @@ -422,7 +484,13 @@ def fetch_observer(args: argparse.Namespace) -> int: doc = bs4.BeautifulSoup(raw, features="html.parser") logger.debug("doc[]='%s'", type(doc)) - items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"}) + navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}) + logger.debug("navbar[]='%s'", type(navbar)) + if navbar is None: + logger.warning("Cannot find navigation bar, cannot continue!") + return 1 + + items = navbar.findAll("a", {"class": "dropdown-item"}) logger.debug("items[]='%s'", type(items)) logger.info("Checking %d menu items ...", len(items)) @@ -449,7 +517,7 @@ def fetch_observer(args: argparse.Namespace) -> int: try: logger.debug("Fetching table data for software='%s' ...", software) raw = utils.fetch_url( - f"https://fediverse.observer/app/views/tabledata.php?software={software}", + f"https://{source_domain}/app/views/tabledata.php?software={software}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")) ).text @@ -458,7 +526,7 @@ def fetch_observer(args: argparse.Namespace) -> int: doc = bs4.BeautifulSoup(raw, features="html.parser") logger.debug("doc[]='%s'", type(doc)) except network.exceptions as exception: - logger.warning("Cannot fetch software='%s' from fediverse.observer: '%s'", software, type(exception)) + logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception)) continue items = doc.findAll("a", {"class": "url"}) @@ -466,20 +534,22 @@ def fetch_observer(args: argparse.Namespace) -> int: for item in items: logger.debug("item[]='%s'", type(item)) domain = item.decode_contents() - logger.debug("domain='%s' - AFTER!", domain) + if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) continue - elif instances.is_recent(domain): - logger.debug("domain='%s' is recently being handled - SKIPPED!", domain) - continue software = software_helper.alias(software) logger.info("Fetching instances for domain='%s'", domain) @@ -491,13 +561,28 @@ def fetch_observer(args: argparse.Namespace) -> int: def fetch_todon_wiki(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + logger.debug("Invoking locking.acquire() ...") locking.acquire() + + source_domain = "wiki.todon.eu" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + blocklist = { "silenced": list(), "reject": list(), } - raw = utils.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + logger.debug("Fetching domainblocks from source_domain='%s'", source_domain) + raw = utils.fetch_url( + f"https://{source_domain}/todon/domainblocks", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(raw, "html.parser") @@ -515,6 +600,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int: blocker = "todon.eu" logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) + instances.set_last_blocked(blocker) instances.set_total_blocks(blocker, blocking) blockdict = list() @@ -538,7 +624,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int: continue logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level) - if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"): + if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker) blockdict.append({ "blocked": blocked, @@ -556,13 +642,17 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int: logger.debug("Checking if blocker='%s' has pending updates ...", blocker) if instances.has_pending(blocker): logger.debug("Flushing updates for blocker='%s' ...", blocker) - instances.update_data(blocker) + instances.update(blocker) logger.debug("Success! - EXIT!") return 0 def fetch_cs(args: argparse.Namespace): logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + extensions = [ "extra", "abbr", @@ -583,12 +673,25 @@ def fetch_cs(args: argparse.Namespace): "wikilinks" ] - domains = { + blocklist = { "silenced": list(), "reject" : list(), } - raw = utils.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + source_domain = "raw.githubusercontent.com" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + logger.info("Fetching federation.md from source_domain='%s' ...", source_domain) + raw = utils.fetch_url( + f"https://{source_domain}/chaossocial/meta/master/federation.md", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser") @@ -596,30 +699,29 @@ def fetch_cs(args: argparse.Namespace): silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody") logger.debug("silenced[%s]()=%d", type(silenced), len(silenced)) - domains["silenced"] = federation.find_domains(silenced) + blocklist["silenced"] = federation.find_domains(silenced) blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody") logger.debug("blocked[%s]()=%d", type(blocked), len(blocked)) - domains["reject"] = federation.find_domains(blocked) + blocklist["reject"] = federation.find_domains(blocked) blocking = blocklist["silenced"] + blocklist["reject"] blocker = "chaos.social" logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking)) + instances.set_last_blocked(blocker) instances.set_total_blocks(blocker, blocking) - logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"])) - blockdict = list() - if len(domains) > 0: - locking.acquire() - - for block_level in domains: - logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level])) + logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"])) + if len(blocking) > 0: + blockdict = list() + for block_level in blocklist: + logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level])) - for row in domains[block_level]: + for row in blocklist[block_level]: logger.debug("row[%s]='%s'", type(row), row) - if instances.is_recent(row["domain"], "last_blocked"): - logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"]) + if not "domain" in row: + logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row)) continue elif not instances.is_registered(row["domain"]): try: @@ -629,7 +731,7 @@ def fetch_cs(args: argparse.Namespace): logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"]) instances.set_last_error(row["domain"], exception) - if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"): + if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker) blockdict.append({ "blocked": row["domain"], @@ -647,34 +749,52 @@ def fetch_cs(args: argparse.Namespace): logger.debug("Checking if blocker='%s' has pending updates ...", blocker) if instances.has_pending(blocker): logger.debug("Flushing updates for blocker='%s' ...", blocker) - instances.update_data(blocker) + instances.update(blocker) logger.debug("Success! - EXIT!") return 0 def fetch_fba_rss(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + domains = list() + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + components = urlparse(args.feed) + + if sources.is_recent(components.netloc): + logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc) + return 0 + else: + logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc) + sources.update(components.netloc) + logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed) response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and len(response.text) > 0: + if response.ok and response.status_code == 200 and len(response.text) > 0: logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text)) rss = atoma.parse_rss_bytes(response.content) logger.debug("rss[]='%s'", type(rss)) for item in rss.items: - logger.debug("item='%s'", item) + logger.debug("item[%s]='%s'", type(item), item) domain = tidyup.domain(item.link.split("=")[1]) logger.debug("domain='%s' - AFTER!", domain) if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: logger.debug("domain='%s' is already added - SKIPPED!", domain) @@ -691,10 +811,9 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - logger.info("Adding %d new instances ...", len(domains)) for domain in domains: + logger.debug("domain='%s'", domain) try: logger.info("Fetching instances from domain='%s' ...", domain) federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) @@ -708,7 +827,25 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: def fetch_fbabot_atom(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - feed = "https://ryona.agency/users/fba/feed.atom" + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "ryona.agency" + feed = f"https://{source_domain}/users/fba/feed.atom" + + logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed) + if args.feed is not None and validators.url(args.feed): + logger.debug("Setting feed='%s' ...", args.feed) + feed = str(args.feed) + source_domain = urlparse(args.feed).netloc + + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) domains = list() @@ -716,7 +853,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and len(response.text) > 0: + if response.ok and response.status_code == 200 and len(response.text) > 0: logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text)) atom = atoma.parse_atom_bytes(response.content) @@ -735,8 +872,13 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif domain in domains: logger.debug("domain='%s' is already added - SKIPPED!", domain) @@ -753,14 +895,12 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: logger.debug("domains()=%d", len(domains)) if len(domains) > 0: - locking.acquire() - logger.info("Adding %d new instances ...", len(domains)) for domain in domains: logger.debug("domain='%s'", domain) try: logger.info("Fetching instances from domain='%s' ...", domain) - federation.fetch_instances(domain, "ryona.agency", None, inspect.currentframe().f_code.co_name) + federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) @@ -771,16 +911,37 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: def fetch_instances(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("args.domain='%s' - checking ...", args.domain) + if not validators.domain(args.domain): + logger.warning("args.domain='%s' is not valid.", args.domain) + return 100 + elif blacklist.is_blacklisted(args.domain): + logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain) + return 101 + + logger.debug("Invoking locking.acquire() ...") locking.acquire() + # Initialize values + domain = tidyup.domain(args.domain) + origin = software = None + + # Fetch record + database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain]) + row = database.cursor.fetchone() + if row is not None: + origin = row["origin"] + software = row["software"] + # Initial fetch try: - logger.info("Fetching instances from args.domain='%s' ...", args.domain) - federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name) + logger.info("Fetching instances from args.domain='%s',origin='%s',software='%s' ...", domain, origin, software) + federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name) except network.exceptions as exception: logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain) instances.set_last_error(args.domain, exception) - instances.update_data(args.domain) + instances.update(args.domain) return 100 if args.single: @@ -789,7 +950,7 @@ def fetch_instances(args: argparse.Namespace) -> int: # Loop through some instances database.cursor.execute( - "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")] + "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")] ) rows = database.cursor.fetchall() @@ -799,72 +960,46 @@ def fetch_instances(args: argparse.Namespace) -> int: if row["domain"] == "": logger.debug("row[domain] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(row["domain"]): - logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"]) + + logger.debug("row[domain]='%s' - BEFORE!", row["domain"]) + domain = row["domain"].encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain) continue try: - logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"]) - federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"]) + logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"]) + federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"]) except network.exceptions as exception: - logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"]) - instances.set_last_error(row["domain"], exception) + logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain) + instances.set_last_error(domain, exception) logger.debug("Success - EXIT!") return 0 def fetch_oliphant(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() + source_domain = "codeberg.org" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + # Base URL - base_url = "https://codeberg.org/oliphant/blocklists/raw/branch/main/blocklists" - - # URLs to fetch - blocklists = ( - { - "blocker": "artisan.chat", - "csv_url": "mastodon/artisan.chat.csv", - },{ - "blocker": "mastodon.art", - "csv_url": "mastodon/mastodon.art.csv", - },{ - "blocker": "pleroma.envs.net", - "csv_url": "mastodon/pleroma.envs.net.csv", - },{ - "blocker": "oliphant.social", - "csv_url": "mastodon/_unified_tier3_blocklist.csv", - },{ - "blocker": "mastodon.online", - "csv_url": "mastodon/mastodon.online.csv", - },{ - "blocker": "mastodon.social", - "csv_url": "mastodon/mastodon.social.csv", - },{ - "blocker": "mastodon.social", - "csv_url": "other/missing-tier0-mastodon.social.csv", - },{ - "blocker": "rage.love", - "csv_url": "mastodon/rage.love.csv", - },{ - "blocker": "sunny.garden", - "csv_url": "mastodon/sunny.garden.csv", - },{ - "blocker": "solarpunk.moe", - "csv_url": "mastodon/solarpunk.moe.csv", - },{ - "blocker": "toot.wales", - "csv_url": "mastodon/toot.wales.csv", - },{ - "blocker": "union.place", - "csv_url": "mastodon/union.place.csv", - } - ) + base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists" domains = list() - logger.debug("Downloading %d files ...", len(blocklists)) - for block in blocklists: + logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists)) + for block in blocklists.oliphant_blocklists: # Is domain given and not equal blocker? if isinstance(args.domain, str) and args.domain != block["blocker"]: logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain) @@ -872,25 +1007,23 @@ def fetch_oliphant(args: argparse.Namespace) -> int: elif args.domain in domains: logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain) continue - elif instances.is_recent(block["blocker"]): - logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"]) - continue + + instances.set_last_blocked(block["blocker"]) # Fetch this URL logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"]) response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content)) - if not response.ok or response.status_code >= 300 or response.content == "": + if not response.ok or response.status_code > 200 or response.content == "": logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"]) continue logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content)) - reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect="unix") + reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix") blockdict = list() - logger.info("Processing %d rows ...", len(reader)) cnt = 0 for row in reader: logger.debug("row[%s]='%s'", type(row), row) @@ -906,9 +1039,9 @@ def fetch_oliphant(args: argparse.Namespace) -> int: continue if "#severity" in row: - severity = row["#severity"] + severity = blocks.alias_block_level(row["#severity"]) elif "severity" in row: - severity = row["severity"] + severity = blocks.alias_block_level(row["severity"]) else: logger.debug("row='%s' does not contain severity column", row) continue @@ -928,18 +1061,38 @@ def fetch_oliphant(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + elif domain.endswith(".onion"): + logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain) + continue + elif domain.endswith(".arpa"): + logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain) + continue + elif domain.endswith(".tld"): + logger.debug("domain='%s' is a fake domain - SKIPPED", domain) + continue + elif domain.find("*") >= 0 or domain.find("?") >= 0: + logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"]) + domain = utils.deobfuscate(domain, block["blocker"]) + logger.debug("domain='%s' - AFTER!", domain) + + if not validators.domain(domain): + logger.debug("domain='%s' is not a valid domain - SKIPPED!") + continue + elif blacklist.is_blacklisted(domain): + logger.warning("domain='%s' is blacklisted - SKIPPED!", domain) + continue + elif blocks.is_instance_blocked(block["blocker"], domain, severity): + logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity) continue logger.debug("Marking domain='%s' as handled", domain) domains.append(domain) logger.debug("Processing domain='%s' ...", domain) - processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name) + processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name) logger.debug("processed='%s'", processed) - if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"): + if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"]) blockdict.append({ "blocked": domain, @@ -947,17 +1100,19 @@ def fetch_oliphant(args: argparse.Namespace) -> int: }) if reject_media: - utils.process_block(block["blocker"], domain, None, "reject_media") + processing.block(block["blocker"], domain, None, "reject_media") if reject_reports: - utils.process_block(block["blocker"], domain, None, "reject_reports") + processing.block(block["blocker"], domain, None, "reject_reports") - logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", block["blocker"], cnt) - instances.set_total_blocks(block["blocker"], cnt) + logger.debug("block[blocker]='%s'", block["blocker"]) + if not blocklists.has(block["blocker"]): + logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains)) + instances.set_total_blocks(block["blocker"], domains) logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"]) if instances.has_pending(block["blocker"]): logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"]) - instances.update_data(block["blocker"]) + instances.update(block["blocker"]) logger.debug("Invoking commit() ...") database.connection.commit() @@ -972,6 +1127,8 @@ def fetch_oliphant(args: argparse.Namespace) -> int: def fetch_txt(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() # Static URLs @@ -986,7 +1143,7 @@ def fetch_txt(args: argparse.Namespace) -> int: response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and response.text != "": + if response.ok and response.status_code == 200 and response.text != "": logger.debug("Returned %d Bytes for processing", len(response.text.strip())) domains = response.text.split("\n") @@ -999,15 +1156,15 @@ def fetch_txt(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + elif not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_recent(domain): logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"]) - processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name) + processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name) logger.debug("processed='%s'", processed) if not processed: @@ -1019,12 +1176,27 @@ def fetch_txt(args: argparse.Namespace) -> int: def fetch_fedipact(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() - response = utils.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + source_domain = "fedipact.online" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + logger.info("Fetching / from source_domain='%s' ...", source_domain) + response = utils.fetch_url( + f"https://{source_domain}", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - if response.ok and response.status_code < 300 and response.text != "": + if response.ok and response.status_code == 200 and response.text != "": logger.debug("Parsing %d Bytes ...", len(response.text)) doc = bs4.BeautifulSoup(response.text, "html.parser") @@ -1040,8 +1212,13 @@ def fetch_fedipact(args: argparse.Namespace) -> int: if domain == "": logger.debug("domain is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue elif instances.is_registered(domain): logger.debug("domain='%s' is already registered - SKIPPED!", domain) @@ -1051,16 +1228,127 @@ def fetch_fedipact(args: argparse.Namespace) -> int: continue logger.info("Fetching domain='%s' ...", domain) - federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) + federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 + +def fetch_joinmobilizon(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "instances.joinmobilizon.org" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + logger.info("Fetching instances from source_domain='%s' ...", source_domain) + raw = utils.fetch_url( + f"https://{source_domain}/api/v1/instances", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) + + parsed = json.loads(raw) + logger.debug("parsed[%s]()=%d", type(parsed), len(parsed)) + + if "data" not in parsed: + logger.warning("parsed()=%d does not contain key 'data'") + return 1 + + logger.info("Checking %d instances ...", len(parsed["data"])) + for row in parsed["data"]: + logger.debug("row[]='%s'", type(row)) + if "host" not in row: + logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row) + continue + elif not domain_helper.is_wanted(row["host"]): + logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"]) + continue + elif instances.is_registered(row["host"]): + logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"]) + continue + + logger.info("Fetching row[host]='%s' ...", row["host"]) + federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 + +def fetch_joinmisskey(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "instanceapp.misskey.page" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + logger.info("Fetching instances.json from source_domain='%s' ...", source_domain) + raw = utils.fetch_url( + f"https://{source_domain}/instances.json", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) + + parsed = json.loads(raw) + logger.debug("parsed[%s]()=%d", type(parsed), len(parsed)) + + if "instancesInfos" not in parsed: + logger.warning("parsed()=%d does not contain element 'instancesInfos'") + return 1 + + logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"])) + for row in parsed["instancesInfos"]: + logger.debug("row[%s]='%s'", type(row), row) + if "url" not in row: + logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row)) + continue + elif not domain_helper.is_wanted(row["url"]): + logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"]) + continue + elif instances.is_registered(row["url"]): + logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"]) + continue + + logger.info("Fetching row[url]='%s' ...", row["url"]) + federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name) logger.debug("Success! - EXIT!") return 0 def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") locking.acquire() - raw = utils.fetch_url("https://joinfediverse.wiki/FediBlock", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + source_domain = "joinfediverse.wiki" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain) + raw = utils.fetch_url( + f"https://{source_domain}/FediBlock", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(raw, "html.parser") @@ -1091,14 +1379,14 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("text[]='%s'", type(text)) if not isinstance(text, str): - logger.debug("text[]='%s' is not 'str' - SKIPPED!", type(text)) + logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text)) continue elif validators.domain(text.strip()): logger.debug("text='%s' is a domain - SKIPPED!", text.strip()) continue text = tidyup.domain(text.strip()) - logger.debug("text='%s'", text) + logger.debug("text='%s' - AFTER!", text) if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]: logger.debug("Found header: '%s'=%d", text, cnt) block_headers[cnt] = text @@ -1149,8 +1437,10 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("block='%s'", block) if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0: origin = block["blocked"] + logger.debug("origin='%s'", origin) for subdomain in block["subdomain(s)"]: block["blocked"] = subdomain + "." + origin + logger.debug("block[blocked]='%s'", block["blocked"]) blocking.append(block) else: blocking.append(block) @@ -1158,41 +1448,45 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("blocking()=%d", blocking) for block in blocking: logger.debug("block[]='%s'", type(block)) - block["blocked"] = tidyup.domain(block["blocked"]) + if "blocked" not in block: + raise KeyError(f"block()={len(block)} does not have element 'blocked'") + block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8") logger.debug("block[blocked]='%s' - AFTER!", block["blocked"]) + if block["blocked"] == "": logger.debug("block[blocked] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(block["blocked"]): - logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + elif not domain_helper.is_wanted(block["blocked"]): + logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif instances.is_recent(block["blocked"]): - logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"]) + logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"]) continue - logger.info("Proccessing blocked='%s' ...", block["blocked"]) - utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name) + logger.debug("Proccessing blocked='%s' ...", block["blocked"]) + processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name) blockdict = list() for blocker in domains: blocker = blocker[0] logger.debug("blocker[%s]='%s'", type(blocker), blocker) + instances.set_last_blocked(blocker) for block in blocking: - logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block["reason"]) + logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None) block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"]) if block["blocked"] == "": logger.debug("block[blocked] is empty - SKIPPED!") continue - elif not utils.is_domain_wanted(block["blocked"]): - logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + elif not domain_helper.is_wanted(block["blocked"]): + logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"]) - if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"): + if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker) blockdict.append({ "blocked": block["blocked"], @@ -1201,7 +1495,7 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: if instances.has_pending(blocker): logger.debug("Flushing updates for blocker='%s' ...", blocker) - instances.update_data(blocker) + instances.update(blocker) logger.debug("Invoking commit() ...") database.connection.commit() @@ -1217,9 +1511,10 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int: def recheck_obfuscation(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + logger.debug("Invoking locking.acquire() ...") locking.acquire() - if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain): + if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain): database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain]) elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software: database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software]) @@ -1230,35 +1525,44 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: logger.info("Checking %d domains ...", len(rows)) for row in rows: logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"]) - if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None: - logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all)) + if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"): + logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force)) continue - blocking = list() - if row["software"] == "pleroma": - logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) - blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"]) - elif row["software"] == "mastodon": - logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) - blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"]) - elif row["software"] == "lemmy": - logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) - blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"]) - elif row["software"] == "friendica": - logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) - blocking = friendica.fetch_blocks(row["domain"]) - elif row["software"] == "misskey": - logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) - blocking = misskey.fetch_blocks(row["domain"]) - else: - logger.warning("Unknown sofware: domain='%s',software='%s'", row["domain"], row["software"]) + logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"]) + blocking = federation.fetch_blocks(row["domain"]) + + logger.debug("blocking()=%d", len(blocking)) + if len(blocking) == 0: + if row["software"] == "pleroma": + logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) + blocking = pleroma.fetch_blocks(row["domain"]) + elif row["software"] == "mastodon": + logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) + blocking = mastodon.fetch_blocks(row["domain"]) + elif row["software"] == "lemmy": + logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) + blocking = lemmy.fetch_blocks(row["domain"]) + elif row["software"] == "friendica": + logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) + blocking = friendica.fetch_blocks(row["domain"]) + elif row["software"] == "misskey": + logger.debug("domain='%s',software='%s'", row["domain"], row["software"]) + blocking = misskey.fetch_blocks(row["domain"]) + else: + logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"]) - logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking)) - instances.set_total_blocks(row["domain"], blocking) + # c.s isn't part of oliphant's "hidden" blocklists + logger.debug("row[domain]='%s'", row["domain"]) + if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]): + logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking)) + instances.set_last_blocked(row["domain"]) + instances.set_total_blocks(row["domain"], blocking) - logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"]) obfuscated = 0 blockdict = list() + + logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"]) for block in blocking: logger.debug("block[blocked]='%s'", block["blocked"]) blocked = None @@ -1278,9 +1582,9 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0: logger.debug("block='%s' is obfuscated.", block["blocked"]) obfuscated = obfuscated + 1 - blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None) - elif not utils.is_domain_wanted(block["blocked"]): - logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) + blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None) + elif not domain_helper.is_wanted(block["blocked"]): + logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"]) continue elif blocks.is_instance_blocked(row["domain"], block["blocked"]): logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"]) @@ -1290,20 +1594,27 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: if blocked is not None and blocked != block["blocked"]: logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked) obfuscated = obfuscated - 1 + if blocks.is_instance_blocked(row["domain"], blocked): logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"]) continue + elif blacklist.is_blacklisted(blocked): + logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) + continue - block["block_level"] = utils.alias_block_level(block["block_level"]) + block["block_level"] = blocks.alias_block_level(block["block_level"]) logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked) - if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"): + if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"): logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"]) blockdict.append({ "blocked": blocked, "reason" : block["reason"], }) + logger.debug("Settings obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"]) + instances.set_obfuscated_blocks(row["domain"], obfuscated) + logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated) if obfuscated == 0 and len(blocking) > 0: logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"]) @@ -1311,7 +1622,7 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: if instances.has_pending(row["domain"]): logger.debug("Flushing updates for blocker='%s' ...", row["domain"]) - instances.update_data(row["domain"]) + instances.update(row["domain"]) logger.debug("Invoking commit() ...") database.connection.commit() @@ -1327,14 +1638,23 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: def fetch_fedilist(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - url = "http://demo.fedilist.com/instance/csv?onion=not" + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "demo.fedilist.com" + if sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + url = f"http://{source_domain}/instance/csv?onion=not" if args.software is not None and args.software != "": logger.debug("args.software='%s'", args.software) - url = f"http://demo.fedilist.com/instance/csv?software={args.software}&onion=not" - - locking.acquire() + url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not" - logger.info("Fetching url='%s' from fedilist.com ...", url) + logger.info("Fetching url='%s' ...", url) response = reqto.get( url, headers=network.web_headers, @@ -1343,23 +1663,43 @@ def fetch_fedilist(args: argparse.Namespace) -> int: ) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) - reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect="unix") + if not response.ok or response.status_code > 200 or len(response.content) == 0: + logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text)) + return 1 + + reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix") logger.debug("reader[]='%s'", type(reader)) - blockdict = list() - for row in reader: + if reader is None: + logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content)) + return 2 + + rows = list(reader) + + logger.info("Checking %d rows ...", len(rows)) + for row in rows: logger.debug("row[]='%s'", type(row)) + if "hostname" not in row: + logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row)) + continue + + logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"]) domain = tidyup.domain(row["hostname"]) logger.debug("domain='%s' - AFTER!", domain) if domain == "": logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"]) continue - elif not utils.is_domain_wanted(domain): - logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) continue - elif (args.all is None or not args.all) and instances.is_registered(domain): - logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s'", type(args.all)) + elif (args.force is None or not args.force) and instances.is_registered(domain): + logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force)) continue elif instances.is_recent(domain): logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) @@ -1374,37 +1714,351 @@ def fetch_fedilist(args: argparse.Namespace) -> int: def update_nodeinfo(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + if args.domain is not None and args.domain != "": logger.debug("Fetching args.domain='%s'", args.domain) database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain]) elif args.software is not None and args.software != "": logger.info("Fetching domains for args.software='%s'", args.software) - database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software]) + database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")]) + elif args.mode is not None and args.mode != "": + logger.info("Fetching domains for args.mode='%s'", args.mode.upper()) + database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")]) + elif args.no_software: + logger.info("Fetching domains with no software type detected ...") + database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")]) else: logger.info("Fetching domains for recently updated ...") - database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_block")]) + database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")]) domains = database.cursor.fetchall() logger.info("Checking %d domain(s) ...", len(domains)) + cnt = 0 for row in domains: logger.debug("row[]='%s'", type(row)) + if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"): + logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"]) + continue + try: - logger.info("Updating nodeinfo for row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"]) + logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100)) software = federation.determine_software(row["domain"]) logger.debug("Determined software='%s'", software) - if software != row["software"]: - logger.warning("Software type has changed from '%s' to '%s'!", row["software"], software) + if (software != row["software"] and software is not None) or args.force is True: + logger.debug("software='%s'", software) + if software is None: + logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"]) + instances.set_nodeinfo_url(row["domain"], None) + + logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software) instances.set_software(row["domain"], software) - instances.set_success(row["domain"]) + if software is not None: + logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"]) + instances.set_success(row["domain"]) except network.exceptions as exception: logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"]) instances.set_last_error(row["domain"], exception) instances.set_last_nodeinfo(row["domain"]) - instances.update_data(row["domain"]) + instances.update(row["domain"]) + cnt = cnt + 1 + + logger.debug("Success! - EXIT!") + return 0 + +def fetch_instances_social(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + source_domain = "instances.social" + + if config.get("instances_social_api_key") == "": + logger.error("API key not set. Please set in your config.json file.") + return 1 + elif sources.is_recent(source_domain): + logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain) + return 0 + else: + logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain) + sources.update(source_domain) + + headers = { + "Authorization": f"Bearer {config.get('instances_social_api_key')}", + } + + logger.info("Fetching list from source_domain='%s' ...", source_domain) + fetched = network.get_json_api( + source_domain, + "/api/1.0/instances/list?count=0&sort_by=name", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + logger.debug("fetched[]='%s'", type(fetched)) + + if "error_message" in fetched: + logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"]) + return 2 + elif "exception" in fetched: + logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"])) + return 3 + elif "json" not in fetched: + logger.warning("fetched has no element 'json' - EXIT!") + return 4 + elif "instances" not in fetched["json"]: + logger.warning("fetched[row] has no element 'instances' - EXIT!") + return 5 + + domains = list() + rows = fetched["json"]["instances"] + + logger.info("Checking %d row(s) ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + domain = tidyup.domain(row["name"]) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = domain.encode("idna").decode("utf-8") + logger.debug("domain='%s' - AFTER!", domain) + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + continue + elif domain in domains: + logger.debug("domain='%s' is already added - SKIPPED!", domain) + continue + elif instances.is_registered(domain): + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue + + logger.info("Fetching instances from domain='%s'", domain) + federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 + +def fetch_relays(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + if args.domain is not None and args.domain != "": + database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain]) + else: + database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')") + + domains = list() + rows = database.cursor.fetchall() + + logger.info("Checking %d relays ...", len(rows)) + for row in rows: + logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"]) + peers = list() + if not args.force and instances.is_recent(row["domain"]): + logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"]) + continue + + try: + logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"]) + raw = utils.fetch_url( + f"https://{row['domain']}", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text + logger.debug("raw[%s]()=%d", type(raw), len(raw)) + except network.exceptions as exception: + logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception)) + instances.set_last_error(row["domain"], exception) + instances.set_last_instance_fetch(row["domain"]) + instances.update(row["domain"]) + continue + + doc = bs4.BeautifulSoup(raw, features="html.parser") + logger.debug("doc[]='%s'", type(doc)) + + logger.debug("row[software]='%s'", row["software"]) + if row["software"] == "activityrelay": + logger.debug("Checking row[domain]='%s' ...", row["domain"]) + tags = doc.findAll("p") + + logger.debug("Checking %d paragraphs ...", len(tags)) + for tag in tags: + logger.debug("tag[]='%s'", type(tag)) + if len(tag.contents) == 0: + logger.debug("tag='%s' is an empty tag - SKIPPED!", tag) + continue + elif "registered instances" not in tag.contents[0]: + logger.debug("Skipping paragraph, text not found.") + continue + + logger.debug("Found tag.contents[0][]='%s'", tag.contents[0]) + for domain in tag.contents: + logger.debug("domain[%s]='%s'", type(domain), domain) + if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain: + continue + + domain = str(domain) + logger.debug("domain='%s'", domain) + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = tidyup.domain(domain) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"]) + continue + elif domain not in peers: + logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"]) + peers.append(domain) + + if dict_helper.has_key(domains, "domain", domain): + logger.debug("domain='%s' already added", domain) + continue + + logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"]) + domains.append({ + "domain": domain, + "origin": row["domain"], + }) + elif row["software"] in ["aoderelay", "selective-relay"]: + logger.debug("Checking row[domain]='%s' ...", row["domain"]) + if row["software"] == "aoderelay": + tags = doc.findAll("section", {"class": "instance"}) + else: + tags = doc.find("div", {"id": "instances"}).findAll("li") + + logger.debug("Checking %d tags ...", len(tags)) + for tag in tags: + logger.debug("tag[]='%s'", type(tag)) + + link = tag.find("a") + logger.debug("link[%s]='%s'", type(link), link) + if link is None: + logger.warning("tag='%s' has no a-tag ...", tag) + continue + + components = urlparse(link["href"]) + domain = components.netloc.lower() + + if not domain_helper.is_wanted(domain): + logger.debug("domain='%s' is not wanted - SKIPPED!", domain) + continue + + logger.debug("domain='%s' - BEFORE!", domain) + domain = tidyup.domain(domain) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"]) + continue + elif domain not in peers: + logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"]) + peers.append(domain) + + if dict_helper.has_key(domains, "domain", domain): + logger.debug("domain='%s' already added", domain) + continue + + logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"]) + domains.append({ + "domain": domain, + "origin": row["domain"], + }) + else: + logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"]) + + logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"]) + instances.set_last_instance_fetch(row["domain"]) + + logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers)) + instances.set_total_peers(row["domain"], peers) + + logger.debug("Flushing data for row[domain]='%s'", row["domain"]) + instances.update(row["domain"]) + + logger.info("Checking %d domains ...", len(domains)) + for row in domains: + logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"]) + if instances.is_registered(row["domain"]): + logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"]) + continue + + logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"]) + federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 + +def convert_idna(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "domain") + + database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + instances.translate_idnas(rows, "origin") + + database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocker") + + database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC") + rows = database.cursor.fetchall() + + logger.debug("rows[]='%s'", type(rows)) + blocks.translate_idnas(rows, "blocked") + + logger.debug("Success! - EXIT!") + return 0 + +def remove_invalid(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + logger.debug("Invoking locking.acquire() ...") + locking.acquire() + + database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC") + rows = database.cursor.fetchall() + + logger.info("Checking %d domains ...", len(rows)) + for row in rows: + logger.debug("row[domain]='%s'", row["domain"]) + if not validators.domain(row["domain"].split("/")[0]): + logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"]) + database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]]) + database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.info("Vaccum cleaning database ...") + database.cursor.execute("VACUUM") logger.debug("Success! - EXIT!") return 0