From ddbf920c6e5963095a10f35abdd85f7b53a6f09e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Wed, 21 Jun 2023 03:16:26 +0200 Subject: [PATCH] Continued: - also tidyup blocked domain from Friendica, too - also check it against blacklist and any unwanted .arpa/.tld TLDs --- fba/networks/friendica.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py index 0c881f7..393f659 100644 --- a/fba/networks/friendica.py +++ b/fba/networks/friendica.py @@ -15,10 +15,12 @@ # along with this program. If not, see . import bs4 +import validators from fba import config from fba import network +from fba.helpers import blacklist from fba.helpers import tidyup from fba.models import instances @@ -68,8 +70,24 @@ def fetch_blocks(domain: str) -> dict: # DEBUG: print(f"DEBUG: Found rows()={len(rows)}") for line in rows: # DEBUG: print(f"DEBUG: line='{line}'") + blocked = tidyup.domain(line.find_all("td")[0].text) + print(f"DEBUG: blocked='{blocked}'") + + if not validators.domain(blocked): + print(f"WARNING: blocked='{blocked}' is not a valid domain - SKIPPED!") + continue + elif blocked.endswith(".arpa"): + print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") + continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue + elif blacklist.is_blacklisted(blocked): + # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!") + continue + blocked.append({ - "domain": tidyup.domain(line.find_all("td")[0].text), + "domain": tidyup.domain(domaih), "reason": tidyup.reason(line.find_all("td")[1].text) }) # DEBUG: print("DEBUG: Next!") -- 2.39.5