From f6b374725ff4af893f305af980040a67ae3e15c8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Tue, 4 Jul 2023 00:37:31 +0200 Subject: [PATCH] Continued: - added command fetch_instances_social to fetch new instances from instances.social - you need to get an API key from them, please don't lower api_last_access too much, your API key/IP address might get banned! - added table `apis` which keeps track of "APIs" accessed, including github and wikis, this is to lower traffic on these sites, again: please DO NOT overdo these requests! Your IP/API key might get blocked! --- blocks_empty.db | Bin 32768 -> 40960 bytes config.defaults.json | 50 +++++----- fba/boot.py | 9 +- fba/commands.py | 213 ++++++++++++++++++++++++++++++++++++---- fba/helpers/config.py | 2 +- fba/models/__init__.py | 1 + fba/models/apis.py | 66 +++++++++++++ fba/models/instances.py | 4 +- 8 files changed, 297 insertions(+), 48 deletions(-) create mode 100644 fba/models/apis.py diff --git a/blocks_empty.db b/blocks_empty.db index c17a1b83412cdebc0db96508edf61b70fe29c29c..5e62f7136fdf8970606fda5d5324a40579723296 100644 GIT binary patch delta 242 zcmZo@U}`wPG(lSM4+8@O7ZAgM{X`vO`9BPL-V=GbH!*Os7c%h8<$cFq$n~3RI+qgv zR9;)&y_*FEKC^8uWOrm?;eE!~y|J;K(XURNU0hU@u`#|RF)1fCu^_V;ggKpqTpdGP z6+#@Hd|VZjKtf6i8eE)88TrK}d5O8HN}M6C5g|H2k(|WhlK8~rJn_?)}o9InRzLx6~%=)nI)<5iKQj^V17Kvl6XT9$;H>q Oz_IxiuUOF{0|fwDk4AU^ delta 81 zcmZoTz|_#dG(lSMHvE8@`-V=GbH!-lY7c%h8<$cFq$n~3RI+qgv e)Xjnd-`O@7vOBV{@HI2pY;3e(+Wd}Js0aWAl@#Fs diff --git a/config.defaults.json b/config.defaults.json index 7f7d37c..a930207 100644 --- a/config.defaults.json +++ b/config.defaults.json @@ -1,27 +1,29 @@ { - "base_url" : "", - "log_level" : "info", - "host" : "127.0.0.1", - "port" : 8069, - "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0", - "connection_timeout": 30, - "read_timeout" : 5, - "hostname" : "fba.ryona.agency", - "timestamp_format" : "%Y-%m-%d %H:%M", + "base_url" : "", + "log_level" : 
"info", + "host" : "127.0.0.1", + "port" : 8069, + "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0", + "connection_timeout" : 30, + "read_timeout" : 5, + "hostname" : "fba.ryona.agency", + "timestamp_format" : "%Y-%m-%d %H:%M", "nodeinfo_connection_timeout": 3, - "nodeinfo_read_timeout" : 2, - "bot_enabled" : false, - "bot_instance" : "https://example.com", - "bot_token" : "", - "bot_visibility" : "unlisted", - "slogan" : "### Your footer slogan ###", - "recheck_instance" : 604800, - "recheck_block" : 43200, - "recheck_nodeinfo" : 604800, - "misskey_limit" : 100, - "error_log_cleanup" : 604800, - "write_error_log" : "true", - "rss_limit" : 50, - "api_limit" : 500, - "theme" : "light", + "nodeinfo_read_timeout" : 2, + "bot_enabled" : false, + "bot_instance" : "https://example.com", + "bot_token" : "", + "bot_visibility" : "unlisted", + "slogan" : "### Your footer slogan ###", + "recheck_instance" : 604800, + "recheck_block" : 43200, + "recheck_nodeinfo" : 604800, + "api_last_access" : 604800, + "misskey_limit" : 100, + "error_log_cleanup" : 604800, + "write_error_log" : "true", + "rss_limit" : 50, + "api_limit" : 500, + "theme" : "light", + "instances_social_api_key": "" } diff --git a/fba/boot.py b/fba/boot.py index c25f707..ebde9f2 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -36,7 +36,7 @@ def init_parser(): logger.debug("Initializing parser ...") _PARSER = argparse.ArgumentParser( description="Fetches block reasons from the fediverse", - epilog="Please note that some commands have optional arguments, you may want to try fba.py --help to find them out.", + epilog="Please note that some commands have optional arguments, you may want to try fba.py --help to find them out. Please DO NOT overdose requests that are not limited by themselves. Typically parameters like --domain, --software and --all are unlimited. 
\"Unlimited\" here means that there is no \"is recently accessed?\" limitation.", ) # Generic: @@ -191,6 +191,13 @@ def init_parser(): parser.add_argument("--domain", help="Instance name (aka. domain)") parser.add_argument("--software", help="Name of software, e.g. 'lemmy'") + ### Fetch instances from instances.social ### + parser = subparser_command.add_parser( + "fetch_instances_social", + help="Fetch instances from instances.social, you need an API key to access the API. Please consider donating to them when you want to more frequent use their API!", + ) + parser.set_defaults(command=commands.fetch_instances_social) + logger.debug("EXIT!") def run_command(): diff --git a/fba/commands.py b/fba/commands.py index 05970a1..cbfc97a 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -41,6 +41,7 @@ from fba.helpers import tidyup from fba.http import federation from fba.http import network +from fba.models import apis from fba.models import blocks from fba.models import instances @@ -100,11 +101,18 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: # No CSRF by default, you don't have to add network.api_headers by yourself here headers = tuple() - domain = "pixelfed.org" + api_domain = "pixelfed.org" + + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) try: - logger.debug("Checking CSRF from domain='%s' ...", domain) - headers = csrf.determine(domain, dict()) + logger.debug("Checking CSRF from api_domain='%s' ...", api_domain) + headers = csrf.determine(api_domain, dict()) except network.exceptions as exception: logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__) return list() @@ -112,7 +120,7 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: try: logger.debug("Fetching JSON from pixelfed.org 
API, headers()=%d ...", len(headers)) fetched = network.get_json_api( - domain, + api_domain, "/api/v1/servers/all.json?scope=All&country=all&language=all", headers, (config.get("connection_timeout"), config.get("read_timeout")) @@ -158,15 +166,29 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int: def fetch_bkali(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + api_domain = "gql.apis.bka.li" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + domains = list() try: - fetched = network.post_json_api("gql.api.bka.li", "/v1/graphql", json.dumps({ - "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" - })) + logger.info("Fetching domainlist from api_domain='%s' ...", api_domain) + fetched = network.post_json_api( + api_domain, + "/v1/graphql", + json.dumps({ + "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}" + }) + ) logger.debug("fetched[]='%s'", type(fetched)) if "error_message" in fetched: - logger.warning("post_json_api() for 'gql.api.bka.li' returned error message='%s", fetched["error_message"]) + logger.warning("post_json_api() for 'gql.apis.bka.li' returned error message='%s", fetched["error_message"]) return 100 elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]: logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"]) @@ -407,6 +429,14 @@ def fetch_blocks(args: argparse.Namespace) -> int: def fetch_observer(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + api_domain = "fediverse.observer" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + 
logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + # Acquire lock locking.acquire() @@ -414,7 +444,7 @@ def fetch_observer(args: argparse.Namespace) -> int: if args.software is None: logger.info("Fetching software list ...") raw = utils.fetch_url( - "https://fediverse.observer", + f"https://{api_domain}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")) ).text @@ -450,7 +480,7 @@ def fetch_observer(args: argparse.Namespace) -> int: try: logger.debug("Fetching table data for software='%s' ...", software) raw = utils.fetch_url( - f"https://fediverse.observer/app/views/tabledata.php?software={software}", + f"https://{api_domain}/app/views/tabledata.php?software={software}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")) ).text @@ -459,7 +489,7 @@ def fetch_observer(args: argparse.Namespace) -> int: doc = bs4.BeautifulSoup(raw, features="html.parser") logger.debug("doc[]='%s'", type(doc)) except network.exceptions as exception: - logger.warning("Cannot fetch software='%s' from fediverse.observer: '%s'", software, type(exception)) + logger.warning("Cannot fetch software='%s' from api_domain='%s': '%s'", software, api_domain, type(exception)) continue items = doc.findAll("a", {"class": "url"}) @@ -492,13 +522,22 @@ def fetch_observer(args: argparse.Namespace) -> int: def fetch_todon_wiki(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + api_domain = "wiki.todon.eu" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + locking.acquire() + blocklist = { "silenced": list(), "reject": list(), } - raw = utils.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, 
(config.get("connection_timeout"), config.get("read_timeout"))).text + raw = utils.fetch_url(f"https://{api_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(raw, "html.parser") @@ -589,7 +628,15 @@ def fetch_cs(args: argparse.Namespace): "reject" : list(), } - raw = utils.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + api_domain = "raw.githubusercontent.com" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + + raw = utils.fetch_url(f"https://{api_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser") @@ -709,7 +756,16 @@ def fetch_fba_rss(args: argparse.Namespace) -> int: def fetch_fbabot_atom(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - feed = "https://ryona.agency/users/fba/feed.atom" + + api_domain = "ryana.agency" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + + feed = f"https://{api_domain}/users/fba/feed.atom" domains = list() @@ -761,7 +817,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: logger.debug("domain='%s'", domain) try: logger.info("Fetching instances from domain='%s' ...", domain) - 
federation.fetch_instances(domain, "ryona.agency", None, inspect.currentframe().f_code.co_name) + federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name) except network.exceptions as exception: logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) @@ -772,6 +828,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int: def fetch_instances(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + locking.acquire() # Initial fetch @@ -816,10 +873,19 @@ def fetch_instances(args: argparse.Namespace) -> int: def fetch_oliphant(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + api_domain = "codeberg.org" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + locking.acquire() # Base URL - base_url = "https://codeberg.org/oliphant/blocklists/raw/branch/main/blocklists" + base_url = f"https://{api_domain}/oliphant/blocklists/raw/branch/main/blocklists" # URLs to fetch blocklists = ( @@ -973,6 +1039,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int: def fetch_txt(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + locking.acquire() # Static URLs @@ -1020,9 +1087,22 @@ def fetch_txt(args: argparse.Namespace) -> int: def fetch_fedipact(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + api_domain = "fedipact.online" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + locking.acquire() - 
response = utils.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + response = utils.fetch_url( + f"https://{api_domain}", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if response.ok and response.status_code < 300 and response.text != "": @@ -1059,9 +1139,22 @@ def fetch_fedipact(args: argparse.Namespace) -> int: def fetch_joinfediverse(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) + + api_domain = "joinfediverse.wiki" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + locking.acquire() - raw = utils.fetch_url("https://joinfediverse.wiki/FediBlock", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + raw = utils.fetch_url( + f"https://{api_domain}/FediBlock", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ).text logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw)) doc = bs4.BeautifulSoup(raw, "html.parser") @@ -1328,10 +1421,18 @@ def recheck_obfuscation(args: argparse.Namespace) -> int: def fetch_fedilist(args: argparse.Namespace) -> int: logger.debug("args[]='%s' - CALLED!", type(args)) - url = "http://demo.fedilist.com/instance/csv?onion=not" + api_domain = "demo.fedilist.com" + if apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + + url = f"http://{api_domain}/instance/csv?onion=not" if 
args.software is not None and args.software != "": logger.debug("args.software='%s'", args.software) - url = f"http://demo.fedilist.com/instance/csv?software={args.software}&onion=not" + url = f"http://{api_domain}/instance/csv?software={args.software}&onion=not" locking.acquire() @@ -1413,3 +1514,75 @@ def update_nodeinfo(args: argparse.Namespace) -> int: logger.debug("Success! - EXIT!") return 0 + +def fetch_instances_social(args: argparse.Namespace) -> int: + logger.debug("args[]='%s' - CALLED!", type(args)) + + api_domain = "instances.social" + + if config.get("instances_social_api_key") == "": + logger.error("API key not set. Please set in your config.json file.") + return 1 + elif apis.is_recent(api_domain): + logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain) + return 0 + else: + logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain) + apis.update(api_domain) + + locking.acquire() + headers = { + "Authorization": f"Bearer {config.get('instances_social_api_key')}", + } + + fetched = network.get_json_api( + api_domain, + "/api/1.0/instances/list?count=0&sort_by=name", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + logger.debug("fetched[]='%s'", type(fetched)) + + if "error_message" in fetched: + logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"]) + return 2 + elif "exception" in fetched: + logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"])) + return 3 + elif "json" not in fetched: + logger.warning("fetched has no element 'json' - EXIT!") + return 4 + elif "instances" not in fetched["json"]: + logger.warning("fetched[row] has no element 'instances' - EXIT!") + return 5 + + domains = list() + rows = fetched["json"]["instances"] + + logger.info("Checking %d row(s) ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + domain = tidyup.domain(row["name"]) + + 
logger.debug("domain='%s' - AFTER!", domain) + if domain == "": + logger.debug("domain is empty - SKIPPED!") + continue + elif not utils.is_domain_wanted(domain): + logger.warning("domain='%s' is not wanted - SKIPPED!", domain) + continue + elif domain in domains: + logger.debug("domain='%s' is already added - SKIPPED!", domain) + continue + elif instances.is_registered(domain): + logger.debug("domain='%s' is already registered - SKIPPED!", domain) + continue + elif instances.is_recent(domain): + logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) + continue + + logger.info("Fetching instances from domain='%s'", domain) + federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name) + + logger.debug("Success! - EXIT!") + return 0 diff --git a/fba/helpers/config.py b/fba/helpers/config.py index ea78c46..b684f90 100644 --- a/fba/helpers/config.py +++ b/fba/helpers/config.py @@ -33,5 +33,5 @@ def get(key: str) -> any: elif not key in _config: raise KeyError(f"key='{key}' does not exist in _config array") - logger.debug("_config[%s][%s]='%s - EXIT!", key, type(_config[key]), _config[key]) + logger.debug("_config[%s][%s]='%s' - EXIT!", key, type(_config[key]), _config[key] if not key.endswith("_api_key") else "***") return _config[key] diff --git a/fba/models/__init__.py b/fba/models/__init__.py index bc2afbe..af51119 100644 --- a/fba/models/__init__.py +++ b/fba/models/__init__.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
__all__ = [ + 'apis', 'blocks', 'error_log', 'instances', diff --git a/fba/models/apis.py b/fba/models/apis.py new file mode 100644 index 0000000..8725e9a --- /dev/null +++ b/fba/models/apis.py @@ -0,0 +1,66 @@ +# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes +# Copyright (C) 2023 Free Software Foundation +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import logging + +import time + +from fba import database + +from fba.helpers import config +from fba.helpers import domain as domain_helper + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def is_recent(api_domain: str) -> bool: + logger.debug("api_domain='%s' - CALLED!", api_domain) + domain_helper.raise_on(api_domain) + + is_recent = False + database.cursor.execute("SELECT last_accessed FROM apis WHERE hostname = ? 
LIMIT 1", [api_domain]) + + row = database.cursor.fetchone() + logger.debug("row[]='%s'", type(row)) + if row is not None: + logger.debug("api_domain='%s',row[last_accessed]=%d", api_domain, row["last_accessed"]) + is_recent = (time.time() - row["last_accessed"]) <= config.get("api_last_access") + + logger.debug("is_recent='%s' - EXIT!", is_recent) + return is_recent + +def update (api_domain: str): + logger.debug("api_domain='%s' - CALLED!", api_domain) + domain_helper.raise_on(api_domain) + + database.cursor.execute("SELECT * FROM apis WHERE hostname = ? LIMIT 1", [api_domain]) + + row = database.cursor.fetchone() + logger.debug("row[]='%s'", type(row)) + if row is None: + # Add instance + database.cursor.execute("INSERT INTO apis (hostname, last_accessed) VALUES (?, ?)", [ + api_domain, + time.time() + ]) + else: + # Update last_accessed + database.cursor.execute("UPDATE apis SET last_accessed = ? WHERE hostname = ? LIMIT 1", [ + time.time(), + api_domain + ]) + + logger.debug("EXIT!") diff --git a/fba/models/instances.py b/fba/models/instances.py index 2017153..5a374a5 100644 --- a/fba/models/instances.py +++ b/fba/models/instances.py @@ -303,10 +303,10 @@ def is_recent(domain: str, column: str = "last_instance_fetch") -> bool: database.cursor.execute(f"SELECT {column} FROM instances WHERE domain = ? LIMIT 1", [domain]) # Fetch row - fetched = database.cursor.fetchone()[0] + fetched = database.cursor.fetchone()[column] logger.debug("fetched[%s]='%s'", type(fetched), fetched) - recently = isinstance(fetched, float) and time.time() - fetched <= config.get("recheck_instance") + recently = isinstance(fetched, float) and (time.time() - fetched) <= config.get("recheck_instance") logger.debug("recently='%s' - EXIT!", recently) return recently -- 2.39.5