From: Roland Häder Date: Wed, 21 Jun 2023 02:21:45 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=c0ac309c7ca70790fba3715cdb61e75570c5950d;p=fba.git Continued: - moved fba.federation|network to package fba.http --- diff --git a/api.py b/api.py index 41580f3..ff0fa99 100644 --- a/api.py +++ b/api.py @@ -30,11 +30,12 @@ import requests import validators from fba import fba -from fba import network from fba.helpers import config from fba.helpers import tidyup +from fba.http import network + router = fastapi.FastAPI(docs_url=config.get("base_url") + "/docs", redoc_url=config.get("base_url") + "/redoc") templates = Jinja2Templates(directory="templates") diff --git a/fba/__init__.py b/fba/__init__.py index e5e1748..e716e10 100644 --- a/fba/__init__.py +++ b/fba/__init__.py @@ -18,11 +18,10 @@ __all__ = [ 'boot', 'commands', 'csrf', - 'federation', 'fba', - 'network', # Sub packages: 'helpers', + 'http', 'models', 'networks', ] diff --git a/fba/commands.py b/fba/commands.py index 24d4840..277a1b7 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -26,9 +26,7 @@ import markdown import reqto import validators -from fba import federation from fba import fba -from fba import network from fba.helpers import blacklist from fba.helpers import config @@ -36,6 +34,9 @@ from fba.helpers import cookies from fba.helpers import locking from fba.helpers import tidyup +from fba.http import federation +from fba.http import network + from fba.models import blocks from fba.models import instances diff --git a/fba/csrf.py b/fba/csrf.py index 8cd8c5e..5158ca5 100644 --- a/fba/csrf.py +++ b/fba/csrf.py @@ -18,11 +18,12 @@ import bs4 import reqto import validators -from fba import network from fba.helpers import config from fba.helpers import cookies +from fba.http import network + def determine(domain: str, headers: dict) -> dict: # DEBUG: print(f"DEBUG: domain='{domain}',headers()={len(headers)} - CALLED!") if not isinstance(domain, str): diff --git a/fba/fba.py b/fba/fba.py index 368de88..e945ffd 100644 --- a/fba/fba.py +++ b/fba/fba.py @@ -21,12 +21,12 @@ from urllib.parse import urlparse import requests import validators -from fba import federation -from fba import network - from fba.helpers import blacklist from fba.helpers import cookies +from fba.http import federation +from fba.http import network + from fba.models import instances # Connect to database diff --git a/fba/federation.py b/fba/federation.py deleted file mode 100644 index b41e651..0000000 --- a/fba/federation.py +++ /dev/null @@ -1,620 +0,0 @@ -# Copyright (C) 2023 Free Software Foundation -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published -# by the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -from urllib.parse import urlparse - -import bs4 -import validators - -from fba import csrf -from fba import network - -from fba.helpers import blacklist -from fba.helpers import config -from fba.helpers import tidyup -from fba.helpers import version - -from fba.models import instances - -from fba.networks import lemmy -from fba.networks import misskey -from fba.networks import peertube - -# "rel" identifiers (no real URLs) -nodeinfo_identifier = [ - "https://nodeinfo.diaspora.software/ns/schema/2.1", - "https://nodeinfo.diaspora.software/ns/schema/2.0", - "https://nodeinfo.diaspora.software/ns/schema/1.1", - "https://nodeinfo.diaspora.software/ns/schema/1.0", - "http://nodeinfo.diaspora.software/ns/schema/2.1", - "http://nodeinfo.diaspora.software/ns/schema/2.0", - "http://nodeinfo.diaspora.software/ns/schema/1.1", - "http://nodeinfo.diaspora.software/ns/schema/1.0", -] - -def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None): - # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(origin, str) and origin is not None: - raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'") - elif software is None: - # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") - instances.set_last_instance_fetch(domain) - - # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...") - software = None - try: - software = determine_software(domain, path) - except network.exceptions as exception: - # DEBUG: print(f"DEBUG: Exception '{type(exception)}' during determining software type") - pass - - # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'") - elif not isinstance(software, str): - raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'") - elif not isinstance(command, str): - raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'") - elif command == "": - raise ValueError("Parameter 'command' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain") - - if not instances.is_registered(domain): - # DEBUG: print(f"DEBUG: Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'") - instances.add(domain, origin, command, path, software) - - # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") - instances.set_last_instance_fetch(domain) - - # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software) - peerlist = fetch_peers(domain, software) - - if peerlist is None: - print("ERROR: Cannot fetch peers:", domain) - return - elif instances.has_pending(domain): - # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...") - instances.update_data(domain) - - print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...") - for instance in peerlist: - # DEBUG: print(f"DEBUG: instance='{instance}'") - if instance is None: - # Skip "None" types as tidup.domain() cannot parse them - continue - - # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE") - instance = tidyup.domain(instance) - # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER") - - if instance == "": - print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'") - continue - elif not validators.domain(instance.split("/")[0]): - print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'") - continue - elif instance.endswith(".arpa"): - print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.") - continue - elif blacklist.is_blacklisted(instance): - # DEBUG: print("DEBUG: instance is blacklisted:", instance) - continue - elif instance.find("/profile/") > 0 or instance.find("/users/") > 0: - # DEBUG: print(f"DEBUG: instance='{instance}' is a link to a single user profile - SKIPPED!") - continue - elif instance.endswith(".tld"): - # DEBUG: print(f"DEBUG: instance='{instance}' is a fake domain - SKIPPED!") - continue - elif not instances.is_registered(instance): - # DEBUG: print("DEBUG: Adding new instance:", instance, domain) - instances.add(instance, domain, command) - - # DEBUG: print("DEBUG: EXIT!") - -def fetch_peers(domain: str, software: str) -> list: - # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',software='{software}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(software, str) and software is not None: - raise ValueError(f"software[]='{type(software)}' is not 'str'") - - if software == "misskey": - # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...") - return misskey.fetch_peers(domain) - elif software == "lemmy": - # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...") - return lemmy.fetch_peers(domain) - elif software == "peertube": - # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...") - return peertube.fetch_peers(domain) - - # Init peers variable - peers = list() - - # No CSRF by default, you don't have to add network.api_headers by yourself here - headers = tuple() - - try: - # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") - headers = csrf.determine(domain, dict()) - except network.exceptions as exception: - print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!") - instances.set_last_error(domain, exception) - return peers - - # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...") - data = network.get_json_api( - domain, - "/api/v1/instance/peers", - headers, - (config.get("connection_timeout"), config.get("read_timeout")) - ) - - # DEBUG: print(f"DEBUG: data[]='{type(data)}'") - if "error_message" in data: - # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...") - data = network.get_json_api( - domain, - "/api/v3/site", - headers, - (config.get("connection_timeout"), config.get("read_timeout")) - ) - - # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'") - if "error_message" in data: - print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'") - elif "federated_instances" in data["json"]: - # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'") - peers = peers + add_peers(data["json"]["federated_instances"]) - # DEBUG: print("DEBUG: Added instance(s) to peers") - else: - message = "JSON response does not contain 'federated_instances' or 'error_message'" - print(f"WARNING: {message},domain='{domain}'") - instances.set_last_error(domain, message) - elif isinstance(data["json"], list): - # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json'])) - peers = data["json"] - else: - print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'") - - # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'") - instances.set_total_peers(domain, peers) - - # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) - return peers - -def fetch_nodeinfo(domain: str, path: str = None) -> dict: - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(path, str) and path is not None: - raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") - - # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...") - nodeinfo = fetch_wellknown_nodeinfo(domain) - - # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]({len(nodeinfo)}='{nodeinfo}'") - if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0: - # DEBUG: print(f"DEBUG: Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!") - return nodeinfo["json"] - - # No CSRF by default, you don't have to add network.api_headers by yourself here - headers = tuple() - data = dict() - - try: - # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") - headers = csrf.determine(domain, dict()) - except network.exceptions as exception: - print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!") - instances.set_last_error(domain, exception) - return { - "status_code" : 500, - "error_message": f"exception[{type(exception)}]='{str(exception)}'", - "exception" : exception, - } - - request_paths = [ - "/nodeinfo/2.1.json", - "/nodeinfo/2.1", - "/nodeinfo/2.0.json", - "/nodeinfo/2.0", - "/nodeinfo/1.0", - "/api/v1/instance" - ] - - for request in request_paths: - # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'") - if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}": - # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...") - if path == f"http://{domain}{path}" or path == f"https://{domain}{path}": - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' has protocol in path, splitting ...") - components = urlparse(path) - path = components.path - - data = network.get_json_api( - domain, - request, - headers, - (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")) - ) - - # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'") - if "error_message" not in data: - # DEBUG: print("DEBUG: Success:", request) - instances.set_detection_mode(domain, "STATIC_CHECK") - instances.set_nodeinfo_url(domain, request) - break - - print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'") - - # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!") - return data - -def fetch_wellknown_nodeinfo(domain: str) -> dict: - # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - - # No CSRF by default, you don't have to add network.api_headers by yourself here - headers = tuple() - - try: - # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") - headers = csrf.determine(domain, dict()) - except network.exceptions as exception: - print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!") - instances.set_last_error(domain, exception) - return { - "status_code" : 500, - "error_message": type(exception), - "exception" : exception, - } - - # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain) - data = network.get_json_api( - domain, - "/.well-known/nodeinfo", - headers, - (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")) - ) - - if "error_message" not in data: - nodeinfo = data["json"] - # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain) - if "links" in nodeinfo: - # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"])) - for link in nodeinfo["links"]: - # DEBUG: print(f"DEBUG: link[{type(link)}]='{link}'") - if not isinstance(link, dict) or not "rel" in link: - print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found") - elif link["rel"] in nodeinfo_identifier: - # Default is that 'href' has a complete URL, but some hosts don't send that - url = link["href"] - components = urlparse(link["href"]) - - # DEBUG: print(f"DEBUG: components[{type(components)}]='{components}'") - if components.scheme == "" and components.netloc == "": - # DEBUG: print(f"DEBUG: link[href]='{link['href']}' has no scheme and host name in it, prepending from domain='{domain}'") - url = f"https://{domain}{url}" - components = urlparse(url) - - if blacklist.is_blacklisted(components.netloc): - print(f"WARNING: components.netloc='{components.netloc}' is blacklisted - SKIPPED!") - continue - elif not validators.domain(components.netloc): - print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!") - continue - - # DEBUG: print("DEBUG: Fetching nodeinfo from:", url) - data = network.fetch_api_url( - url, - (config.get("connection_timeout"), config.get("read_timeout")) - ) - - # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data)) - if "error_message" not in data and "json" in data: - # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data)) - instances.set_detection_mode(domain, "AUTO_DISCOVERY") - instances.set_nodeinfo_url(domain, link["href"]) - break - else: - instances.set_last_error(domain, data) - else: - print("WARNING: Unknown 'rel' value:", domain, link["rel"]) - else: - print("WARNING: nodeinfo does not contain 'links':", domain) - - # DEBUG: print("DEBUG: Returning data[]:", type(data)) - return data - -def fetch_generator_from_path(domain: str, path: str = "/") -> str: - # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(path, str): - raise ValueError(f"path[]='{type(path)}' is not 'str'") - elif path == "": - raise ValueError("Parameter 'path' is empty") - - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!") - software = None - - # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...") - response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) - - # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text)) - if response.ok and response.status_code < 300 and response.text.find(" 0: - # DEBUG: print(f"DEBUG: Parsing response.text()={len(response.text)} Bytes ...") - doc = bs4.BeautifulSoup(response.text, "html.parser") - - # DEBUG: print("DEBUG: doc[]:", type(doc)) - generator = doc.find("meta", {"name" : "generator"}) - site_name = doc.find("meta", {"property": "og:site_name"}) - - # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'") - if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str): - # DEBUG: print("DEBUG: Found generator meta tag:", domain) - software = tidyup.domain(generator.get("content")) - # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") - if software is not None and software != "": - print(f"INFO: domain='{domain}' is generated by '{software}'") - instances.set_detection_mode(domain, "GENERATOR") - elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str): - # DEBUG: print("DEBUG: Found property=og:site_name:", domain) - software = tidyup.domain(site_name.get("content")) - # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") - if software is not None and software != "": - print(f"INFO: domain='{domain}' has og:site_name='{software}'") - instances.set_detection_mode(domain, "SITE_NAME") - - # DEBUG: print(f"DEBUG: software[]='{type(software)}'") - if isinstance(software, str) and software == "": - # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'") - software = None - elif isinstance(software, str) and ("." in software or " " in software): - # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") - software = version.remove(software) - - # DEBUG: print(f"DEBUG: software[]='{type(software)}'") - if isinstance(software, str) and "powered by " in software: - # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") - software = version.remove(version.strip_powered_by(software)) - elif isinstance(software, str) and " hosted on " in software: - # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it") - software = version.remove(version.strip_hosted_on(software)) - elif isinstance(software, str) and " by " in software: - # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it") - software = version.strip_until(software, " by ") - elif isinstance(software, str) and " see " in software: - # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it") - software = version.strip_until(software, " see ") - - # DEBUG: print(f"DEBUG: software='{software}' - EXIT!") - return software - -def determine_software(domain: str, path: str = None) -> str: - # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not isinstance(path, str) and path is not None: - raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") - - # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path) - software = None - - # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...") - data = fetch_nodeinfo(domain, path) - - # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'") - if "exception" in data: - # Continue raising it - raise data["exception"] - elif "error_message" in data: - # DEBUG: print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'") - return fetch_generator_from_path(domain) - elif "status" in data and data["status"] == "error" and "message" in data: - print("WARNING: JSON response is an error:", data["message"]) - instances.set_last_error(domain, data["message"]) - return fetch_generator_from_path(domain) - elif "message" in data: - print("WARNING: JSON response contains only a message:", data["message"]) - instances.set_last_error(domain, data["message"]) - return fetch_generator_from_path(domain) - elif "software" not in data or "name" not in data["software"]: - # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...") - software = fetch_generator_from_path(domain) - # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: '{software}'") - elif "software" in data and "name" in data["software"]: - # DEBUG: print("DEBUG: Found data[software][name] in JSON response") - software = data["software"]["name"] - - if software is None: - # DEBUG: print("DEBUG: Returning None - EXIT!") - return None - - sofware = tidyup.domain(software) - # DEBUG: print("DEBUG: sofware after tidyup.domain():", software) - - if software in ["akkoma", "rebased", "akkounfucked", "ched"]: - # DEBUG: print("DEBUG: Setting pleroma:", domain, software) - software = "pleroma" - elif software in ["hometown", "ecko"]: - # DEBUG: print("DEBUG: Setting mastodon:", domain, software) - software = "mastodon" - elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]: - # DEBUG: print("DEBUG: Setting misskey:", domain, software) - software = "misskey" - elif software == "runtube.re": - # DEBUG: print("DEBUG: Setting peertube:", domain, software) - software = "peertube" - elif software == "nextcloud social": - # DEBUG: print("DEBUG: Setting nextcloud:", domain, software) - software = "nextcloud" - elif software.find("/") > 0: - print("WARNING: Spliting of slash:", software) - software = tidyup.domain(software.split("/")[-1]) - elif software.find("|") > 0: - print("WARNING: Spliting of pipe:", software) - software = tidyup.domain(software.split("|")[0]) - elif "powered by" in software: - # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") - software = version.strip_powered_by(software) - elif isinstance(software, str) and " by " in software: - # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it") - software = version.strip_until(software, " by ") - elif isinstance(software, str) and " see " in software: - # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it") - software = version.strip_until(software, " see ") - - # DEBUG: print(f"DEBUG: software[]='{type(software)}'") - if software == "": - print("WARNING: tidyup.domain() left no software name behind:", domain) - software = None - - # DEBUG: print(f"DEBUG: software[]='{type(software)}'") - if str(software) == "": - # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...") - software = fetch_generator_from_path(domain) - elif len(str(software)) > 0 and ("." in software or " " in software): - # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") - software = version.remove(software) - - # DEBUG: print(f"DEBUG: software[]='{type(software)}'") - if isinstance(software, str) and "powered by" in software: - # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") - software = version.remove(version.strip_powered_by(software)) - - # DEBUG: print("DEBUG: Returning domain,software:", domain, software) - return software - -def find_domains(tag: bs4.element.Tag) -> list: - # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!") - if not isinstance(tag, bs4.element.Tag): - raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag") - elif len(tag.select("tr")) == 0: - raise KeyError("No table rows found in table!") - - domains = list() - for element in tag.select("tr"): - # DEBUG: print(f"DEBUG: element[]='{type(element)}'") - if not element.find("td"): - # DEBUG: print("DEBUG: Skipping element, no found") - continue - - domain = tidyup.domain(element.find("td").text) - reason = tidyup.reason(element.findAll("td")[1].text) - - # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'") - - if blacklist.is_blacklisted(domain): - print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!") - continue - elif domain == "gab.com/.ai, develop.gab.com": - # DEBUG: print("DEBUG: Multiple domains detected in one row") - domains.append({ - "domain": "gab.com", - "reason": reason, - }) - domains.append({ - "domain": "gab.ai", - "reason": reason, - }) - domains.append({ - "domain": "develop.gab.com", - "reason": reason, - }) - continue - elif not validators.domain(domain.split("/")[0]): - print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!") - continue - - # DEBUG: print(f"DEBUG: Adding domain='{domain}',reason='{reason}' ...") - domains.append({ - "domain": domain, - "reason": reason, - }) - - # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!") - return domains - -def add_peers(rows: dict) -> list: - # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!") - peers = list() - for key in ["linked", "allowed", "blocked"]: - # DEBUG: print(f"DEBUG: Checking key='{key}'") - if key in rows and rows[key] is not None: - # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...") - for peer in rows[key]: - # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!") - peer = tidyup.domain(peer) - - # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!") - if blacklist.is_blacklisted(peer): - # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!") - continue - - # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...") - peers.append(peer) - - # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!") - return peers diff --git a/fba/http/__init__.py b/fba/http/__init__.py new file mode 100644 index 0000000..5d1da3a --- /dev/null +++ b/fba/http/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2023 Free Software Foundation +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +__all__ = [ + 'federation', + 'network', +] diff --git a/fba/http/federation.py b/fba/http/federation.py new file mode 100644 index 0000000..054fc3f --- /dev/null +++ b/fba/http/federation.py @@ -0,0 +1,621 @@ +# Copyright (C) 2023 Free Software Foundation +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from urllib.parse import urlparse + +import bs4 +import validators + +from fba import csrf + +from fba.helpers import blacklist +from fba.helpers import config +from fba.helpers import tidyup +from fba.helpers import version + +from fba.http import network + +from fba.models import instances + +from fba.networks import lemmy +from fba.networks import misskey +from fba.networks import peertube + +# "rel" identifiers (no real URLs) +nodeinfo_identifier = [ + "https://nodeinfo.diaspora.software/ns/schema/2.1", + "https://nodeinfo.diaspora.software/ns/schema/2.0", + "https://nodeinfo.diaspora.software/ns/schema/1.1", + "https://nodeinfo.diaspora.software/ns/schema/1.0", + "http://nodeinfo.diaspora.software/ns/schema/2.1", + "http://nodeinfo.diaspora.software/ns/schema/2.0", + "http://nodeinfo.diaspora.software/ns/schema/1.1", + "http://nodeinfo.diaspora.software/ns/schema/1.0", +] + +def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None): + # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(origin, str) and origin is not None: + raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'") + elif software is None: + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + instances.set_last_instance_fetch(domain) + + # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...") + software = None + try: + software = determine_software(domain, path) + except network.exceptions as exception: + # DEBUG: print(f"DEBUG: Exception '{type(exception)}' during determining software type") + pass + + # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'") + elif not isinstance(software, str): + raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'") + elif not isinstance(command, str): + raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'") + elif command == "": + raise ValueError("Parameter 'command' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain") + + if not instances.is_registered(domain): + # DEBUG: print(f"DEBUG: Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'") + instances.add(domain, origin, command, path, software) + + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + instances.set_last_instance_fetch(domain) + + # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software) + peerlist = fetch_peers(domain, software) + + if peerlist is None: + print("ERROR: Cannot fetch peers:", domain) + return + elif instances.has_pending(domain): + # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...") + instances.update_data(domain) + + print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...") + for instance in peerlist: + # DEBUG: print(f"DEBUG: instance='{instance}'") + if instance is None: + # Skip "None" types as tidup.domain() cannot parse them + continue + + # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE") + instance = tidyup.domain(instance) + # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER") + + if instance == "": + print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'") + continue + elif not validators.domain(instance.split("/")[0]): + print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'") + continue + elif instance.endswith(".arpa"): + print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.") + continue + elif blacklist.is_blacklisted(instance): + # DEBUG: print("DEBUG: instance is blacklisted:", instance) + continue + elif instance.find("/profile/") > 0 or instance.find("/users/") > 0: + # DEBUG: print(f"DEBUG: instance='{instance}' is a link to a single user profile - SKIPPED!") + continue + elif instance.endswith(".tld"): + # DEBUG: print(f"DEBUG: instance='{instance}' is a fake domain - SKIPPED!") + continue + elif not instances.is_registered(instance): + # DEBUG: print("DEBUG: Adding new instance:", instance, domain) + instances.add(instance, domain, command) + + # DEBUG: print("DEBUG: EXIT!") + +def fetch_peers(domain: str, software: str) -> list: + # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',software='{software}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(software, str) and software is not None: + raise ValueError(f"software[]='{type(software)}' is not 'str'") + + if software == "misskey": + # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...") + return misskey.fetch_peers(domain) + elif software == "lemmy": + # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...") + return lemmy.fetch_peers(domain) + elif software == "peertube": + # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...") + return peertube.fetch_peers(domain) + + # Init peers variable + peers = list() + + # No CSRF by default, you don't have to add network.api_headers by yourself here + headers = tuple() + + try: + # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") + headers = csrf.determine(domain, dict()) + except network.exceptions as exception: + print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!") + instances.set_last_error(domain, exception) + return peers + + # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...") + data = network.get_json_api( + domain, + "/api/v1/instance/peers", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + # DEBUG: print(f"DEBUG: data[]='{type(data)}'") + if "error_message" in data: + # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...") + data = network.get_json_api( + domain, + "/api/v3/site", + headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'") + if "error_message" in data: + print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'") + elif "federated_instances" in data["json"]: + # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'") + peers = peers + add_peers(data["json"]["federated_instances"]) + # DEBUG: print("DEBUG: Added instance(s) to peers") + else: + message = "JSON response does not contain 'federated_instances' or 'error_message'" + print(f"WARNING: {message},domain='{domain}'") + instances.set_last_error(domain, message) + elif isinstance(data["json"], list): + # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json'])) + peers = data["json"] + else: + print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'") + + # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'") + instances.set_total_peers(domain, peers) + + # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) + return peers + +def fetch_nodeinfo(domain: str, path: str = None) -> dict: + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(path, str) and path is not None: + raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") + + # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...") + nodeinfo = fetch_wellknown_nodeinfo(domain) + + # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]({len(nodeinfo)}='{nodeinfo}'") + if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0: + # DEBUG: print(f"DEBUG: Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!") + return nodeinfo["json"] + + # No CSRF by default, you don't have to add network.api_headers by yourself here + headers = tuple() + data = dict() + + try: + # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") + headers = csrf.determine(domain, dict()) + except network.exceptions as exception: + print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!") + instances.set_last_error(domain, exception) + return { + "status_code" : 500, + "error_message": f"exception[{type(exception)}]='{str(exception)}'", + "exception" : exception, + } + + request_paths = [ + "/nodeinfo/2.1.json", + "/nodeinfo/2.1", + "/nodeinfo/2.0.json", + "/nodeinfo/2.0", + "/nodeinfo/1.0", + "/api/v1/instance" + ] + + for request in request_paths: + # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'") + if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}": + # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...") + if path == f"http://{domain}{path}" or path == f"https://{domain}{path}": + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' has protocol in path, splitting ...") + components = urlparse(path) + path = components.path + + data = network.get_json_api( + domain, + request, + headers, + (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")) + ) + + # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'") + if "error_message" not in data: + # DEBUG: print("DEBUG: Success:", request) + instances.set_detection_mode(domain, "STATIC_CHECK") + instances.set_nodeinfo_url(domain, request) + break + + print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'") + + # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!") + return data + +def fetch_wellknown_nodeinfo(domain: str) -> dict: + # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + + # No CSRF by default, you don't have to add network.api_headers by yourself here + headers = tuple() + + try: + # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'") + headers = csrf.determine(domain, dict()) + except network.exceptions as exception: + print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!") + instances.set_last_error(domain, exception) + return { + "status_code" : 500, + "error_message": type(exception), + "exception" : exception, + } + + # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain) + data = network.get_json_api( + domain, + "/.well-known/nodeinfo", + headers, + (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")) + ) + + if "error_message" not in data: + nodeinfo = data["json"] + # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain) + if "links" in nodeinfo: + # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"])) + for link in nodeinfo["links"]: + # DEBUG: print(f"DEBUG: link[{type(link)}]='{link}'") + if not isinstance(link, dict) or not "rel" in link: + print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found") + elif link["rel"] in nodeinfo_identifier: + # Default is that 'href' has a complete URL, but some hosts don't send that + url = link["href"] + components = urlparse(link["href"]) + + # DEBUG: print(f"DEBUG: components[{type(components)}]='{components}'") + if components.scheme == "" and components.netloc == "": + # DEBUG: print(f"DEBUG: link[href]='{link['href']}' has no scheme and host name in it, prepending from domain='{domain}'") + url = f"https://{domain}{url}" + components = urlparse(url) + + if blacklist.is_blacklisted(components.netloc): + print(f"WARNING: components.netloc='{components.netloc}' is blacklisted - SKIPPED!") + continue + elif not validators.domain(components.netloc): + print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!") + continue + + # DEBUG: print("DEBUG: Fetching nodeinfo from:", url) + data = network.fetch_api_url( + url, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data)) + if "error_message" not in data and "json" in data: + # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data)) + instances.set_detection_mode(domain, "AUTO_DISCOVERY") + instances.set_nodeinfo_url(domain, link["href"]) + break + else: + instances.set_last_error(domain, data) + else: + print("WARNING: Unknown 'rel' value:", domain, link["rel"]) + else: + print("WARNING: nodeinfo does not contain 'links':", domain) + + # DEBUG: print("DEBUG: Returning data[]:", type(data)) + return data + +def fetch_generator_from_path(domain: str, path: str = "/") -> str: + # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(path, str): + raise ValueError(f"path[]='{type(path)}' is not 'str'") + elif path == "": + raise ValueError("Parameter 'path' is empty") + + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!") + software = None + + # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...") + response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))) + + # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text)) + if response.ok and response.status_code < 300 and response.text.find(" 0: + # DEBUG: print(f"DEBUG: Parsing response.text()={len(response.text)} Bytes ...") + doc = bs4.BeautifulSoup(response.text, "html.parser") + + # DEBUG: print("DEBUG: doc[]:", type(doc)) + generator = doc.find("meta", {"name" : "generator"}) + site_name = doc.find("meta", {"property": "og:site_name"}) + + # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'") + if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str): + # DEBUG: print("DEBUG: Found generator meta tag:", domain) + software = tidyup.domain(generator.get("content")) + # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") + if software is not None and software != "": + print(f"INFO: domain='{domain}' is generated by '{software}'") + instances.set_detection_mode(domain, "GENERATOR") + elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str): + # DEBUG: print("DEBUG: Found property=og:site_name:", domain) + software = tidyup.domain(site_name.get("content")) + # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") + if software is not None and software != "": + print(f"INFO: domain='{domain}' has og:site_name='{software}'") + instances.set_detection_mode(domain, "SITE_NAME") + + # DEBUG: print(f"DEBUG: software[]='{type(software)}'") + if isinstance(software, str) and software == "": + # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'") + software = None + elif isinstance(software, str) and ("." in software or " " in software): + # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") + software = version.remove(software) + + # DEBUG: print(f"DEBUG: software[]='{type(software)}'") + if isinstance(software, str) and "powered by " in software: + # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") + software = version.remove(version.strip_powered_by(software)) + elif isinstance(software, str) and " hosted on " in software: + # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it") + software = version.remove(version.strip_hosted_on(software)) + elif isinstance(software, str) and " by " in software: + # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it") + software = version.strip_until(software, " by ") + elif isinstance(software, str) and " see " in software: + # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it") + software = version.strip_until(software, " see ") + + # DEBUG: print(f"DEBUG: software='{software}' - EXIT!") + return software + +def determine_software(domain: str, path: str = None) -> str: + # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not isinstance(path, str) and path is not None: + raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") + + # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path) + software = None + + # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...") + data = fetch_nodeinfo(domain, path) + + # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'") + if "exception" in data: + # Continue raising it + raise data["exception"] + elif "error_message" in data: + # DEBUG: print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'") + return fetch_generator_from_path(domain) + elif "status" in data and data["status"] == "error" and "message" in data: + print("WARNING: JSON response is an error:", data["message"]) + instances.set_last_error(domain, data["message"]) + return fetch_generator_from_path(domain) + elif "message" in data: + print("WARNING: JSON response contains only a message:", data["message"]) + instances.set_last_error(domain, data["message"]) + return fetch_generator_from_path(domain) + elif "software" not in data or "name" not in data["software"]: + # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...") + software = fetch_generator_from_path(domain) + # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: '{software}'") + elif "software" in data and "name" in data["software"]: + # DEBUG: print("DEBUG: Found data[software][name] in JSON response") + software = data["software"]["name"] + + if software is None: + # DEBUG: print("DEBUG: Returning None - EXIT!") + return None + + sofware = tidyup.domain(software) + # DEBUG: print("DEBUG: sofware after tidyup.domain():", software) + + if software in ["akkoma", "rebased", "akkounfucked", "ched"]: + # DEBUG: print("DEBUG: Setting pleroma:", domain, software) + software = "pleroma" + elif software in ["hometown", "ecko"]: + # DEBUG: print("DEBUG: Setting mastodon:", domain, software) + software = "mastodon" + elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]: + # DEBUG: print("DEBUG: Setting misskey:", domain, software) + software = "misskey" + elif software == "runtube.re": + # DEBUG: print("DEBUG: Setting peertube:", domain, software) + software = "peertube" + elif software == "nextcloud social": + # DEBUG: print("DEBUG: Setting nextcloud:", domain, software) + software = "nextcloud" + elif software.find("/") > 0: + print("WARNING: Spliting of slash:", software) + software = tidyup.domain(software.split("/")[-1]) + elif software.find("|") > 0: + print("WARNING: Spliting of pipe:", software) + software = tidyup.domain(software.split("|")[0]) + elif "powered by" in software: + # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") + software = version.strip_powered_by(software) + elif isinstance(software, str) and " by " in software: + # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it") + software = version.strip_until(software, " by ") + elif isinstance(software, str) and " see " in software: + # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it") + software = version.strip_until(software, " see ") + + # DEBUG: print(f"DEBUG: software[]='{type(software)}'") + if software == "": + print("WARNING: tidyup.domain() left no software name behind:", domain) + software = None + + # DEBUG: print(f"DEBUG: software[]='{type(software)}'") + if str(software) == "": + # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...") + software = fetch_generator_from_path(domain) + elif len(str(software)) > 0 and ("." in software or " " in software): + # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") + software = version.remove(software) + + # DEBUG: print(f"DEBUG: software[]='{type(software)}'") + if isinstance(software, str) and "powered by" in software: + # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") + software = version.remove(version.strip_powered_by(software)) + + # DEBUG: print("DEBUG: Returning domain,software:", domain, software) + return software + +def find_domains(tag: bs4.element.Tag) -> list: + # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!") + if not isinstance(tag, bs4.element.Tag): + raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag") + elif len(tag.select("tr")) == 0: + raise KeyError("No table rows found in table!") + + domains = list() + for element in tag.select("tr"): + # DEBUG: print(f"DEBUG: element[]='{type(element)}'") + if not element.find("td"): + # DEBUG: print("DEBUG: Skipping element, no found") + continue + + domain = tidyup.domain(element.find("td").text) + reason = tidyup.reason(element.findAll("td")[1].text) + + # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'") + + if blacklist.is_blacklisted(domain): + print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!") + continue + elif domain == "gab.com/.ai, develop.gab.com": + # DEBUG: print("DEBUG: Multiple domains detected in one row") + domains.append({ + "domain": "gab.com", + "reason": reason, + }) + domains.append({ + "domain": "gab.ai", + "reason": reason, + }) + domains.append({ + "domain": "develop.gab.com", + "reason": reason, + }) + continue + elif not validators.domain(domain.split("/")[0]): + print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!") + continue + + # DEBUG: print(f"DEBUG: Adding domain='{domain}',reason='{reason}' ...") + domains.append({ + "domain": domain, + "reason": reason, + }) + + # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!") + return domains + +def add_peers(rows: dict) -> list: + # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!") + peers = list() + for key in ["linked", "allowed", "blocked"]: + # DEBUG: print(f"DEBUG: Checking key='{key}'") + if key in rows and rows[key] is not None: + # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...") + for peer in rows[key]: + # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!") + peer = tidyup.domain(peer) + + # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!") + if blacklist.is_blacklisted(peer): + # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!") + continue + + # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...") + peers.append(peer) + + # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!") + return peers diff --git a/fba/http/network.py b/fba/http/network.py new file mode 100644 index 0000000..33ae3c6 --- /dev/null +++ b/fba/http/network.py @@ -0,0 +1,301 @@ +# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes +# Copyright (C) 2023 Free Software Foundation +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import json +import reqto +import requests +import urllib3 +import validators + +from fba import fba + +from fba.helpers import config +from fba.helpers import cookies + +from fba.models import instances + +# HTTP headers for non-API requests +web_headers = { + "User-Agent": config.get("useragent"), +} + +# HTTP headers for API requests +api_headers = { + "User-Agent" : config.get("useragent"), + "Content-Type": "application/json", +} + +# Exceptions to always catch +exceptions = ( + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ConnectionError, + requests.exceptions.InvalidSchema, + requests.exceptions.InvalidURL, + requests.exceptions.Timeout, + requests.exceptions.TooManyRedirects, + UnicodeEncodeError, + urllib3.exceptions.LocationParseError +) + +def post_json_api(domain: str, path: str, data: str = "", headers: dict = {}) -> dict: + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',data='{data}',headers()={len(headers)} - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(path, str): + raise ValueError(f"path[]='{type(path)}' is not 'str'") + elif path == "": + raise ValueError("Parameter 'path' cannot be empty") + elif not isinstance(data, str): + raise ValueError(f"data[]='{type(data)}' is not 'str'") + elif not isinstance(headers, dict): + raise ValueError(f"headers[]='{type(headers)}' is not 'list'") + + json_reply = { + "status_code": 200, + } + + try: + # DEBUG: print(f"DEBUG: Sending POST to domain='{domain}',path='{path}',data='{data}',headers({len(headers)})={headers}") + response = reqto.post( + f"https://{domain}{path}", + data=data, + headers={**api_headers, **headers}, + timeout=(config.get("connection_timeout"), config.get("read_timeout")), + cookies=cookies.get_all(domain) if cookies.has(domain) else {} + ) + + json_reply["json"] = json_from_response(response) + + # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") + if not response.ok or response.status_code >= 400: + print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',data()={len(data)},response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") + json_reply["status_code"] = response.status_code + json_reply["error_message"] = response.reason + del json_reply["json"] + instances.set_last_error(domain, response) + + except exceptions as exception: + # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") + json_reply["status_code"] = 999 + json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" + json_reply["exception"] = exception + instances.set_last_error(domain, exception) + raise exception + + # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") + return json_reply + +def fetch_api_url(url: str, timeout: tuple) -> dict: + # DEBUG: print(f"DEBUG: url='{url}',timeout()={len(timeout)} - CALLED!") + if not isinstance(url, str): + raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'") + elif not isinstance(timeout, tuple): + raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") + + json_reply = { + "status_code": 200, + } + + try: + # DEBUG: print(f"DEBUG: Fetching url='{url}' ...") + response = fba.fetch_url(url, api_headers, timeout) + + json_reply["json"] = json_from_response(response) + + # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") + if not response.ok or response.status_code >= 400: + print(f"WARNING: Cannot query JSON API: url='{url}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") + json_reply["status_code"] = response.status_code + json_reply["error_message"] = response.reason + del json_reply["json"] + + except exceptions as exception: + # DEBUG: print(f"DEBUG: Fetching '{url}' failed. exception[{type(exception)}]='{str(exception)}'") + json_reply["status_code"] = 999 + json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" + json_reply["exception"] = exception + raise exception + + # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") + return json_reply + +def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict: + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',timeout()={len(timeout)} - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(path, str): + raise ValueError(f"path[]='{type(path)}' is not 'str'") + elif path == "": + raise ValueError("Parameter 'path' cannot be empty") + elif not isinstance(headers, dict): + raise ValueError(f"headers[]='{type(headers)}' is not 'list'") + elif not isinstance(timeout, tuple): + raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") + + json_reply = { + "status_code": 200, + } + + try: + # DEBUG: print(f"DEBUG: Sending GET to domain='{domain}',path='{path}',timeout({len(timeout)})={timeout}") + response = reqto.get( + f"https://{domain}{path}", + headers={**api_headers, **headers}, + timeout=timeout, + cookies=cookies.get_all(domain) if cookies.has(domain) else {} + ) + + except exceptions as exception: + # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") + json_reply["status_code"] = 999 + json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" + json_reply["exception"] = exception + instances.set_last_error(domain, exception) + raise exception + + json_reply["json"] = json_from_response(response) + + # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") + if not response.ok or response.status_code >= 400: + print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") + json_reply["status_code"] = response.status_code + json_reply["error_message"] = response.reason + del json_reply["json"] + instances.set_last_error(domain, response) + + # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") + return json_reply + +def send_bot_post(domain: str, blocklist: dict): + # DEBUG: print(f"DEBUG: domain='{domain}',blocklist()={len(blocklist)} - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(blocklist, dict): + raise ValueError(f"Parameter blocklist[]='{type(blocklist)}' is not 'dict'") + + message = f"{domain} has blocked the following instances:\n\n" + truncated = False + + if len(blocklist) > 20: + truncated = True + blocklist = blocklist[0 : 19] + + # DEBUG: print(f"DEBUG: blocklist()={len(blocklist)}") + for block in blocklist: + # DEBUG: print(f"DEBUG: block['{type(block)}']={block}") + if block["reason"] is None or block["reason"] == '': + message = message + block["blocked"] + " with unspecified reason\n" + else: + if len(block["reason"]) > 420: + block["reason"] = block["reason"][0:419] + "[…]" + + message = message + block["blocked"] + ' for "' + block["reason"].replace("@", "@\u200b") + '"\n' + + if truncated: + message = message + "(the list has been truncated to the first 20 entries)" + + botheaders = {**api_headers, **{"Authorization": "Bearer " + config.get("bot_token")}} + + req = reqto.post( + f"{config.get('bot_instance')}/api/v1/statuses", + data={ + "status" : message, + "visibility" : config.get('bot_visibility'), + "content_type": "text/plain" + }, + headers=botheaders, + timeout=10 + ).json() + + return True + +def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> requests.models.Response: + # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',headers()={len(headers)},timeout={timeout} - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not validators.domain(domain.split("/")[0]): + raise ValueError(f"domain='{domain}' is not a valid domain") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") + elif domain.endswith(".tld"): + raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif not isinstance(path, str): + raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") + elif path == "": + raise ValueError("Parameter 'path' is empty") + elif not isinstance(headers, dict): + raise ValueError(f"headers[]='{type(headers)}' is not 'dict'") + elif not isinstance(timeout, tuple): + raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") + + try: + # DEBUG: print(f"DEBUG: Sending GET request to '{domain}{path}' ...") + response = reqto.get( + f"https://{domain}{path}", + headers=headers, + timeout=timeout, + cookies=cookies.get_all(domain) if cookies.has(domain) else {} + ) + + except exceptions as exception: + # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") + instances.set_last_error(domain, exception) + raise exception + + # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!") + return response + +def json_from_response(response: requests.models.Response) -> list: + # DEBUG: print(f"DEBUG: response[]='{type(response)}' - CALLED!") + if not isinstance(response, requests.models.Response): + raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'") + + data = list() + if response.text.strip() != "": + # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...") + try: + data = response.json() + except json.decoder.JSONDecodeError: + pass + + # DEBUG: print(f"DEBUG: data[]='{type(data)}' - EXIT!") + return data diff --git a/fba/models/instances.py b/fba/models/instances.py index ecb5454..571475f 100644 --- a/fba/models/instances.py +++ b/fba/models/instances.py @@ -21,13 +21,14 @@ import requests import validators from fba import fba -from fba import federation -from fba import network from fba.helpers import blacklist from fba.helpers import cache from fba.helpers import config +from fba.http import federation +from fba.http import network + from fba.models import error_log # Found info from node, such as nodeinfo URL, detection mode that needs to be diff --git a/fba/network.py b/fba/network.py deleted file mode 100644 index 33ae3c6..0000000 --- a/fba/network.py +++ /dev/null @@ -1,301 +0,0 @@ -# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes -# Copyright (C) 2023 Free Software Foundation -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published -# by the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -import json -import reqto -import requests -import urllib3 -import validators - -from fba import fba - -from fba.helpers import config -from fba.helpers import cookies - -from fba.models import instances - -# HTTP headers for non-API requests -web_headers = { - "User-Agent": config.get("useragent"), -} - -# HTTP headers for API requests -api_headers = { - "User-Agent" : config.get("useragent"), - "Content-Type": "application/json", -} - -# Exceptions to always catch -exceptions = ( - requests.exceptions.ChunkedEncodingError, - requests.exceptions.ConnectionError, - requests.exceptions.InvalidSchema, - requests.exceptions.InvalidURL, - requests.exceptions.Timeout, - requests.exceptions.TooManyRedirects, - UnicodeEncodeError, - urllib3.exceptions.LocationParseError -) - -def post_json_api(domain: str, path: str, data: str = "", headers: dict = {}) -> dict: - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',data='{data}',headers()={len(headers)} - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(path, str): - raise ValueError(f"path[]='{type(path)}' is not 'str'") - elif path == "": - raise ValueError("Parameter 'path' cannot be empty") - elif not isinstance(data, str): - raise ValueError(f"data[]='{type(data)}' is not 'str'") - elif not isinstance(headers, dict): - raise ValueError(f"headers[]='{type(headers)}' is not 'list'") - - json_reply = { - "status_code": 200, - } - - try: - # DEBUG: print(f"DEBUG: Sending POST to domain='{domain}',path='{path}',data='{data}',headers({len(headers)})={headers}") - response = reqto.post( - f"https://{domain}{path}", - data=data, - headers={**api_headers, **headers}, - timeout=(config.get("connection_timeout"), config.get("read_timeout")), - cookies=cookies.get_all(domain) if cookies.has(domain) else {} - ) - - json_reply["json"] = json_from_response(response) - - # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") - if not response.ok or response.status_code >= 400: - print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',data()={len(data)},response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") - json_reply["status_code"] = response.status_code - json_reply["error_message"] = response.reason - del json_reply["json"] - instances.set_last_error(domain, response) - - except exceptions as exception: - # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") - json_reply["status_code"] = 999 - json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" - json_reply["exception"] = exception - instances.set_last_error(domain, exception) - raise exception - - # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") - return json_reply - -def fetch_api_url(url: str, timeout: tuple) -> dict: - # DEBUG: print(f"DEBUG: url='{url}',timeout()={len(timeout)} - CALLED!") - if not isinstance(url, str): - raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'") - elif not isinstance(timeout, tuple): - raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") - - json_reply = { - "status_code": 200, - } - - try: - # DEBUG: print(f"DEBUG: Fetching url='{url}' ...") - response = fba.fetch_url(url, api_headers, timeout) - - json_reply["json"] = json_from_response(response) - - # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") - if not response.ok or response.status_code >= 400: - print(f"WARNING: Cannot query JSON API: url='{url}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") - json_reply["status_code"] = response.status_code - json_reply["error_message"] = response.reason - del json_reply["json"] - - except exceptions as exception: - # DEBUG: print(f"DEBUG: Fetching '{url}' failed. exception[{type(exception)}]='{str(exception)}'") - json_reply["status_code"] = 999 - json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" - json_reply["exception"] = exception - raise exception - - # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") - return json_reply - -def get_json_api(domain: str, path: str, headers: dict, timeout: tuple) -> dict: - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',timeout()={len(timeout)} - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(path, str): - raise ValueError(f"path[]='{type(path)}' is not 'str'") - elif path == "": - raise ValueError("Parameter 'path' cannot be empty") - elif not isinstance(headers, dict): - raise ValueError(f"headers[]='{type(headers)}' is not 'list'") - elif not isinstance(timeout, tuple): - raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") - - json_reply = { - "status_code": 200, - } - - try: - # DEBUG: print(f"DEBUG: Sending GET to domain='{domain}',path='{path}',timeout({len(timeout)})={timeout}") - response = reqto.get( - f"https://{domain}{path}", - headers={**api_headers, **headers}, - timeout=timeout, - cookies=cookies.get_all(domain) if cookies.has(domain) else {} - ) - - except exceptions as exception: - # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") - json_reply["status_code"] = 999 - json_reply["error_message"] = f"exception['{type(exception)}']='{str(exception)}'" - json_reply["exception"] = exception - instances.set_last_error(domain, exception) - raise exception - - json_reply["json"] = json_from_response(response) - - # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},json_reply[]='{type(json_reply)}'") - if not response.ok or response.status_code >= 400: - print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',response.status_code='{response.status_code}',json_reply[]='{type(json_reply)}'") - json_reply["status_code"] = response.status_code - json_reply["error_message"] = response.reason - del json_reply["json"] - instances.set_last_error(domain, response) - - # DEBUG: print(f"DEBUG: Returning json_reply({len(json_reply)})=[]:{type(json_reply)}") - return json_reply - -def send_bot_post(domain: str, blocklist: dict): - # DEBUG: print(f"DEBUG: domain='{domain}',blocklist()={len(blocklist)} - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(blocklist, dict): - raise ValueError(f"Parameter blocklist[]='{type(blocklist)}' is not 'dict'") - - message = f"{domain} has blocked the following instances:\n\n" - truncated = False - - if len(blocklist) > 20: - truncated = True - blocklist = blocklist[0 : 19] - - # DEBUG: print(f"DEBUG: blocklist()={len(blocklist)}") - for block in blocklist: - # DEBUG: print(f"DEBUG: block['{type(block)}']={block}") - if block["reason"] is None or block["reason"] == '': - message = message + block["blocked"] + " with unspecified reason\n" - else: - if len(block["reason"]) > 420: - block["reason"] = block["reason"][0:419] + "[…]" - - message = message + block["blocked"] + ' for "' + block["reason"].replace("@", "@\u200b") + '"\n' - - if truncated: - message = message + "(the list has been truncated to the first 20 entries)" - - botheaders = {**api_headers, **{"Authorization": "Bearer " + config.get("bot_token")}} - - req = reqto.post( - f"{config.get('bot_instance')}/api/v1/statuses", - data={ - "status" : message, - "visibility" : config.get('bot_visibility'), - "content_type": "text/plain" - }, - headers=botheaders, - timeout=10 - ).json() - - return True - -def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> requests.models.Response: - # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',headers()={len(headers)},timeout={timeout} - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(path, str): - raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'") - elif path == "": - raise ValueError("Parameter 'path' is empty") - elif not isinstance(headers, dict): - raise ValueError(f"headers[]='{type(headers)}' is not 'dict'") - elif not isinstance(timeout, tuple): - raise ValueError(f"timeout[]='{type(timeout)}' is not 'tuple'") - - try: - # DEBUG: print(f"DEBUG: Sending GET request to '{domain}{path}' ...") - response = reqto.get( - f"https://{domain}{path}", - headers=headers, - timeout=timeout, - cookies=cookies.get_all(domain) if cookies.has(domain) else {} - ) - - except exceptions as exception: - # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(exception)}]='{str(exception)}'") - instances.set_last_error(domain, exception) - raise exception - - # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!") - return response - -def json_from_response(response: requests.models.Response) -> list: - # DEBUG: print(f"DEBUG: response[]='{type(response)}' - CALLED!") - if not isinstance(response, requests.models.Response): - raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'") - - data = list() - if response.text.strip() != "": - # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...") - try: - data = response.json() - except json.decoder.JSONDecodeError: - pass - - # DEBUG: print(f"DEBUG: data[]='{type(data)}' - EXIT!") - return data diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py index 1ebd0a0..20814a6 100644 --- a/fba/networks/friendica.py +++ b/fba/networks/friendica.py @@ -17,12 +17,12 @@ import bs4 import validators -from fba import network - from fba.helpers import blacklist from fba.helpers import config from fba.helpers import tidyup +from fba.http import network + from fba.models import instances def fetch_blocks(domain: str) -> dict: diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py index cf44f42..7dd8f1d 100644 --- a/fba/networks/lemmy.py +++ b/fba/networks/lemmy.py @@ -21,13 +21,14 @@ import validators from fba import csrf from fba import fba -from fba import federation -from fba import network from fba.helpers import blacklist from fba.helpers import config from fba.helpers import tidyup +from fba.http import federation +from fba.http import network + from fba.models import blocks from fba.models import instances diff --git a/fba/networks/mastodon.py b/fba/networks/mastodon.py index 29718a0..a440c04 100644 --- a/fba/networks/mastodon.py +++ b/fba/networks/mastodon.py @@ -21,12 +21,13 @@ import validators from fba import csrf from fba import fba -from fba import network from fba.helpers import blacklist from fba.helpers import config from fba.helpers import tidyup +from fba.http import network + from fba.models import blocks from fba.models import instances diff --git a/fba/networks/misskey.py b/fba/networks/misskey.py index ff5da39..47a3de0 100644 --- a/fba/networks/misskey.py +++ b/fba/networks/misskey.py @@ -17,13 +17,14 @@ import json from fba import csrf -from fba import network from fba.helpers import blacklist from fba.helpers import config from fba.helpers import dicts from fba.helpers import tidyup +from fba.http import network + from fba.models import instances def fetch_peers(domain: str) -> list: diff --git a/fba/networks/peertube.py b/fba/networks/peertube.py index 13672ea..b9764c5 100644 --- a/fba/networks/peertube.py +++ b/fba/networks/peertube.py @@ -15,10 +15,11 @@ # along with this program. If not, see . from fba import csrf -from fba import network from fba.helpers import config +from fba.http import network + from fba.models import instances def fetch_peers(domain: str) -> list: diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py index f82575e..9d6dab0 100644 --- a/fba/networks/pleroma.py +++ b/fba/networks/pleroma.py @@ -20,13 +20,14 @@ import bs4 import validators from fba import fba -from fba import federation -from fba import network from fba.helpers import blacklist from fba.helpers import config from fba.helpers import tidyup +from fba.http import federation +from fba.http import network + from fba.models import blocks from fba.models import instances