X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fnetworks%2Ffriendica.py;h=1434697ae1ac06146eb4b78d79eb275e961c38ef;hb=f9d221393d508c052c56cbc8abf04aa411776454;hp=b3f6bc40d342ceee218fa42b6567b4a2026ef940;hpb=88c47338c39987cc13590f6a1f221169f26a4cd1;p=fba.git

diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py
index b3f6bc4..1434697 100644
--- a/fba/networks/friendica.py
+++ b/fba/networks/friendica.py
@@ -17,10 +17,9 @@ import logging
 
 import bs4
-import validators
 
-from fba.helpers import blacklist
 from fba.helpers import config
+from fba.helpers import domain as domain_helper
 from fba.helpers import tidyup
 
 from fba.http import network
 
@@ -29,85 +28,80 @@ from fba.models import instances
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+#logger.setLevel(logging.DEBUG)
 
-def fetch_blocks(domain: str) -> dict:
-    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
-    if not isinstance(domain, str):
-        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
-    elif domain == "":
-        raise ValueError("Parameter 'domain' is empty")
-    elif domain.lower() != domain:
-        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
-    elif not validators.domain(domain.split("/")[0]):
-        raise ValueError(f"domain='{domain}' is not a valid domain")
-    elif domain.endswith(".arpa"):
-        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif domain.endswith(".tld"):
-        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
+def fetch_blocks(domain: str) -> list:
+    logger.debug("domain='%s' - CALLED!", domain)
+    domain_helper.raise_on(domain)
+
+    if not instances.is_registered(domain):
+        raise Exception(f"domain='{domain}' is not registered but function is invoked.")
 
     blocklist = list()
     block_tag = None
 
     try:
-        logger.debug("Fetching friendica blocks from domain:", domain)
-        doc = bs4.BeautifulSoup(
-            network.fetch_response(
-                domain,
-                "/friendica",
-                network.web_headers,
-                (config.get("connection_timeout"), config.get("read_timeout"))
-            ).text,
-            "html.parser",
-        )
+        logger.debug("Fetching friendica blocks from domain='%s'", domain)
+        raw = network.fetch_response(
+            domain,
+            "/friendica",
+            network.web_headers,
+            (config.get("connection_timeout"), config.get("read_timeout"))
+        ).text
+        logger.debug("Parsing %d Bytes ...", len(raw))
+
+        doc = bs4.BeautifulSoup(raw, "html.parser",)
+        logger.debug("doc[]='%s'", type(doc))
 
         block_tag = doc.find(id="about_blocklist")
+        logger.debug("block_tag[%s]='%s'", type(block_tag), block_tag)
     except network.exceptions as exception:
-        logger.warning(f"Exception '{type(exception)}' during fetching instances (friendica) from domain='{domain}'")
+        logger.warning("Exception '%s' during fetching instances from domain='%s'", type(exception), domain)
         instances.set_last_error(domain, exception)
-        return dict()
 
-    # Prevents exceptions:
+        logger.debug("Returning empty list ... - EXIT!")
+        return list()
+
+    logger.debug("block_tag[%s]='%s'", type(block_tag), block_tag)
     if block_tag is None:
-        logger.debug("Instance has no block list:", domain)
-        return dict()
+        logger.debug("Instance has no block list: domain='%s' - EXIT!", domain)
+        return list()
 
     table = block_tag.find("table")
-    logger.debug(f"table[]='{type(table)}'")
-    if table.find("tbody"):
+    logger.debug("table[]='%s'", type(table))
+    if table is None:
+        logger.warning("domain='%s' has no table tag - EXIT !", domain)
+        return list()
+    elif table.find("tbody"):
         rows = table.find("tbody").find_all("tr")
     else:
         rows = table.find_all("tr")
 
-    logger.debug(f"Found rows()={len(rows)}")
+    logger.debug("Found rows()=%d", len(rows))
     for line in rows:
-        logger.debug(f"line='{line}'")
-        blocked = tidyup.domain(line.find_all("td")[0].text)
+        logger.debug("line='%s'", line)
+        blocked = line.find_all("td")[0].text
+        logger.debug("blocked='%s'", blocked)
+
+        blocked = tidyup.domain(blocked) if blocked != "" else None
         reason  = tidyup.reason(line.find_all("td")[1].text)
-        logger.debug(f"blocked='{blocked}',reason='{reason}'")
+        logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
-        if not validators.domain(blocked):
-            logger.warning(f"blocked='{blocked}' is not a valid domain - SKIPPED!")
-            continue
-        elif blocked.endswith(".arpa"):
-            logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-            continue
-        elif blocked.endswith(".tld"):
-            logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
+        if blocked is None or blocked == "":
+            logger.warning("line[]='%s' returned empty blocked domain - SKIPPED!", type(line))
             continue
-        elif blacklist.is_blacklisted(blocked):
-            logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+        elif not domain_helper.is_wanted(blocked):
+            logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
            continue
 
-        logger.debug(f"Appending blocked='{blocked}',reason='{reason}'")
+        logger.debug("Appending blocked='%s',reason='%s'", blocked, reason)
         blocklist.append({
-            "domain": tidyup.domain(blocked),
-            "reason": tidyup.reason(reason)
+            "blocker"    : domain,
+            "blocked"    : blocked,
+            "reason"     : reason,
+            "block_level": "reject",
         })
-        logger.debug("Next!")
 
-    logger.debug("Returning blocklist() for domain:", domain, len(blocklist))
-    return {
-        "reject": blocklist
-    }
+    logger.debug("blocklist()=%d - EXIT!", len(blocklist))
+    return blocklist