From: Roland Häder <roland@mxchange.org> Date: Wed, 21 Jun 2023 01:16:26 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=ddbf920c6e5963095a10f35abdd85f7b53a6f09e;p=fba.git Continued: - also tidy up the blocked domain from Friendica - also check it against the blacklist and skip any unwanted .arpa/.tld TLDs --- diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py index 0c881f7..393f659 100644 --- a/fba/networks/friendica.py +++ b/fba/networks/friendica.py @@ -15,10 +15,12 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. import bs4 +import validators from fba import config from fba import network +from fba.helpers import blacklist from fba.helpers import tidyup from fba.models import instances @@ -68,8 +70,24 @@ def fetch_blocks(domain: str) -> dict: # DEBUG: print(f"DEBUG: Found rows()={len(rows)}") for line in rows: # DEBUG: print(f"DEBUG: line='{line}'") + blocked = tidyup.domain(line.find_all("td")[0].text) + print(f"DEBUG: blocked='{blocked}'") + + if not validators.domain(blocked): + print(f"WARNING: blocked='{blocked}' is not a valid domain - SKIPPED!") + continue + elif blocked.endswith(".arpa"): + print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.") + continue + elif blocked.endswith(".tld"): + print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!") + continue + elif blacklist.is_blacklisted(blocked): + # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!") + continue + blocked.append({ - "domain": tidyup.domain(line.find_all("td")[0].text), + "domain": tidyup.domain(domaih), "reason": tidyup.reason(line.find_all("td")[1].text) }) # DEBUG: print("DEBUG: Next!")