From: Roland Häder Date: Wed, 21 Jun 2023 00:28:35 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=d05290ed05c19056b269cd8dd802dbef5b0384db;p=fba.git Continued: - implemented lemmy.fetch_blocks() - don't catch exceptions generically (BaseException), better only network.exceptions --- diff --git a/fba/commands.py b/fba/commands.py index ddeb8ec..f276a1f 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -40,6 +40,7 @@ from fba.models import blocks from fba.models import instances from fba.networks import friendica +from fba.networks import lemmy from fba.networks import mastodon from fba.networks import misskey from fba.networks import pleroma @@ -195,7 +196,7 @@ def fetch_blocks(args: argparse.Namespace): mastodon.fetch_blocks(blocker, origin, nodeinfo_url) elif software == "lemmy": print(f"INFO: blocker='{blocker}',software='{software}'") - #lemmy.fetch_blocks(blocker, origin, nodeinfo_url) + lemmy.fetch_blocks(blocker, origin, nodeinfo_url) elif software == "friendica" or software == "misskey": print(f"INFO: blocker='{blocker}',software='{software}'") diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py index 5a43afc..63829bb 100644 --- a/fba/networks/lemmy.py +++ b/fba/networks/lemmy.py @@ -14,11 +14,19 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>.
+import inspect + +import bs4 +import validators + +from fba import blacklist from fba import config from fba import csrf +from fba import fba from fba import federation from fba import network +from fba.models import blocks from fba.models import instances def fetch_peers(domain: str) -> list: @@ -62,7 +70,7 @@ def fetch_peers(domain: str) -> list: print("WARNING: JSON response does not contain 'federated_instances':", domain) instances.set_last_error(domain, data) - except BaseException as exception: + except network.exceptions as exception: print(f"WARNING: Exception during fetching JSON: domain='{domain}',exception[{type(exception)}]:'{str(exception)}'") # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'") @@ -70,3 +78,109 @@ def fetch_peers(domain: str) -> list: # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) return peers + +def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): + # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!") + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + elif not isinstance(origin, str) and origin is not None: + raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'") + elif origin == "": + raise ValueError("Parameter 'origin' is empty") + elif not isinstance(nodeinfo_url, str): + raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'") + elif nodeinfo_url == "": + raise ValueError("Parameter 'nodeinfo_url' is empty") + + translations = [ + "blocked instances", + ] + + try: + # json endpoint for newer mastodongs + found_blocks = list() + blocklist = list() + + rows = { + "reject" : [], + "media_removal" : [], + "followers_only": [], + "report_removal": [], + } + + # DEBUG: print(f"DEBUG: Fetching /instances from domain='{domain}'") + response = network.fetch_response( + domain, + 
"/instances", + network.web_headers, + (config.get("connection_timeout"), config.get("read_timeout")) + ) + + # DEBUG: print(f"DEBUG: response.ok='{response.ok}',response.status_code={response.status_code},response.text()={len(response.text)}") + if response.ok and response.status_code < 300 and response.text != "": + # DEBUG: print(f"DEBUG: Parsing {len(response.text)} Bytes ...") + + doc = bs4.BeautifulSoup(response.text, "html.parser") + # DEBUG: print(f"DEBUG: doc[]={type(doc)}") + + headers = doc.findAll("h5") + found = None + # DEBUG: print(f"DEBUG: Search in {len(headers)} header(s) ...") + for header in headers: + # DEBUG: print(f"DEBUG: header[]={type(header)}") + content = header.contents[0] + + # DEBUG: print(f"DEBUG: content='{content}'") + if content.lower() in translations: + # DEBUG: print("DEBUG: Found header with blocked instances - BREAK!") + found = header + break + + # DEBUG: print(f"DEBUG: found[]='{type(found)}'") + if found is None: + # DEBUG: print(f"DEBUG: domain='{domain}' is not blocking any instances - EXIT!") + return + + blocking = found.find_next("ul").findAll("a") + # DEBUG: print(f"DEBUG: Found {len(blocking)} blocked instance(s) ...") + for tag in blocking: + # DEBUG: print(f"DEBUG: tag[]='{type(tag)}'") + blocked = tag.contents[0] + + # DEBUG: print(f"DEBUG: blocked='{blocked}'") + if not validators.domain(blocked): + # DEBUG: print(f"DEBUG: blocked='{blocked}' is not a valid domain - SKIPPED!") + continue + elif blacklist.is_blacklisted(blocked): + # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!") + continue + elif blocked.endswith(".arpa"): + print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") + continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue + elif not instances.is_registered(blocked): + # DEBUG: print("DEBUG: Hash wasn't found, adding:", blocked, domain) + 
instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) + + if not blocks.is_instance_blocked(domain, blocked, "reject"): + # DEBUG: print("DEBUG: Blocking:", domain, blocked) + blocks.add_instance(domain, blocked, None, "reject") + + found_blocks.append({ + "blocked": blocked, + "reason" : None + }) + else: + # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...") + blocks.update_last_seen(domain, blocked, "reject") + + # DEBUG: print("DEBUG: Committing changes ...") + fba.connection.commit() + except network.exceptions as exception: + print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'") + + # DEBUG: print("DEBUG: EXIT!") diff --git a/fba/networks/mastodon.py b/fba/networks/mastodon.py index 293b8dd..b0392ea 100644 --- a/fba/networks/mastodon.py +++ b/fba/networks/mastodon.py @@ -82,8 +82,8 @@ def fetch_blocks_from_about(domain: str) -> dict: # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!") break - except BaseException as exception: - print("ERROR: Cannot fetch from domain:", domain, exception) + except network.exceptions as exception: + print(f"ERROR: Cannot fetch from domain='{domain}',exception='{type(exception)}'") instances.set_last_error(domain, exception) break @@ -284,6 +284,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) @@ -295,6 +298,9 @@ def fetch_blocks(domain: str, 
origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # DEBUG: print("DEBUG: Hash wasn't found, adding:", blocked, domain) instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py index 39ca927..f23393c 100644 --- a/fba/networks/pleroma.py +++ b/fba/networks/pleroma.py @@ -143,6 +143,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # Commit changes fba.connection.commit() @@ -213,6 +216,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # Commit changes fba.connection.commit() @@ -316,6 +322,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., 
domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) @@ -388,6 +397,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) @@ -460,6 +472,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): elif blocked.endswith(".arpa"): print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue elif not instances.is_registered(blocked): # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) @@ -519,7 +534,7 @@ def fetch_blocks_from_about(domain: str) -> dict: # DEBUG: print(f"DEBUG: Found 'h2' header in path='{path}' - BREAK!") break - except BaseException as exception: + except network.exceptions as exception: print("ERROR: Cannot fetch from domain:", domain, exception) instances.set_last_error(domain, exception) break