X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fnetworks%2Fpleroma.py;h=435d7ce2d3099ea38f5e4ca2cc51f93474365e6b;hb=2d6883fad6ba05ee90224274ec57979f47a568d9;hp=7899906708fba85266280d5280aaa5a0f789dca5;hpb=e795feb5e4a8239d45754dd3c053e0c7dcec1355;p=fba.git diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py index 7899906..435d7ce 100644 --- a/fba/networks/pleroma.py +++ b/fba/networks/pleroma.py @@ -14,23 +14,20 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import inspect import logging import bs4 -import validators from fba import database from fba import utils -from fba.helpers import blacklist from fba.helpers import config +from fba.helpers import domain as domain_helper from fba.helpers import tidyup from fba.http import federation from fba.http import network -from fba.models import blocks from fba.models import instances logging.basicConfig(level=logging.INFO) @@ -39,58 +36,46 @@ logger = logging.getLogger(__name__) # Language mapping X -> English language_mapping = { # English -> English - "Reject": "Suspended servers", + "limited servers" : "followers_only", + "suspended servers": "reject", + "silenced servers" : "silenced", + "filtered media" : "filtered_media", } -def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): - logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(origin, str) and origin is not None: - raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'") - elif origin == "": - raise ValueError("Parameter 'origin' is empty") - elif not isinstance(nodeinfo_url, str): +def fetch_blocks(domain: str, nodeinfo_url: str) -> list: + logger.debug("domain='%s',nodeinfo_url='%s' - CALLED!", domain, nodeinfo_url) + domain_helper.raise_on(domain) + + if not isinstance(nodeinfo_url, str): raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'") elif nodeinfo_url == "": raise ValueError("Parameter 'nodeinfo_url' is empty") - # @TODO Unused blockdict blockdict = list() rows = None try: - logger.debug(f"Fetching nodeinfo: domain='{domain}',nodeinfo_url='{nodeinfo_url}'") + logger.debug("Fetching nodeinfo: domain='%s',nodeinfo_url='%s'", domain, nodeinfo_url) rows = federation.fetch_nodeinfo(domain, nodeinfo_url) except network.exceptions as exception: - logger.warning(f"Exception '{type(exception)}' during fetching nodeinfo") + logger.warning("Exception '%s' during fetching nodeinfo from domain='%s'", type(exception), domain) instances.set_last_error(domain, exception) if rows is None: - logger.warning("Could not fetch nodeinfo from domain:", domain) - return + logger.warning("Could not fetch nodeinfo from domain='%s'", domain) + return list() elif "metadata" not in rows: - logger.warning(f"rows()={len(rows)} does not have key 'metadata', domain='{domain}'") - return + logger.warning("rows()=%d does not have key 'metadata', domain='%s'", len(rows), domain) + return list() elif "federation" not in rows["metadata"]: - logger.warning(f"rows()={len(rows['metadata'])} does not have key 'federation', domain='{domain}'") - return + logger.warning("rows()=%d does not have key 'federation', domain='%s'", len(rows["metadata"]), domain) + return list() data = rows["metadata"]["federation"] found = False logger.debug("data[]='%s'", type(data)) if "mrf_simple" in data: - logger.debug("Found mrf_simple:", domain) + logger.debug("Found mrf_simple in API response from domain='%s'", domain) found = True for block_level, blocklist in ( { @@ -100,9 +85,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): } } ).items(): - logger.debug("block_level, blocklist():", block_level, len(blocklist)) + logger.debug("block_level='%s', blocklist()=%d", block_level, len(blocklist)) block_level = tidyup.domain(block_level) - logger.debug("BEFORE block_level:", block_level) + logger.debug("block_level='%s' - AFTER!", block_level) if block_level == "": logger.warning("block_level is now empty!") @@ -111,148 +96,80 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): logger.debug("domain='%s' skipping block_level='accept'", domain) continue - logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...") + block_level = utils.alias_block_level(block_level) + + logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(blocklist), domain, block_level) if len(blocklist) > 0: for blocked in blocklist: - logger.debug("BEFORE blocked:", blocked) + logger.debug("blocked='%s' - BEFORE!", blocked) blocked = tidyup.domain(blocked) - logger.debug("AFTER blocked:", blocked) + logger.debug("blocked='%s' - AFTER!", blocked) if blocked == "": - logger.warning("blocked is empty after tidyup.domain():", domain, block_level) + logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s' - SKIPPED!", domain, block_level) continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) + elif not utils.is_domain_wanted(blocked): + logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif blocked.count("*") > 0: - # Obscured domain name with no hash - row = instances.deobscure("*", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Obscured domain name with no hash - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - - logger.debug(f"blocked='{blocked}'") + + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain) + blocked = utils.deobfuscate_domain(blocked, domain) + + logger.debug("blocked='%s' - DEOBFUSCATED!", blocked) if not utils.is_domain_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif not instances.is_registered(blocked): - # Commit changes - logger.debug("Invoking commit() ...") - database.connection.commit() - - logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") - instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) - - if not blocks.is_instance_blocked(domain, blocked, block_level): - logger.debug("Blocking:", domain, blocked, block_level) - blocks.add_instance(domain, blocked, None, block_level) - - if block_level == "reject": - logger.debug("Adding to blockdict:", blocked) - blockdict.append({ - "blocked": blocked, - "reason" : None - }) - else: - logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...") - blocks.update_last_seen(domain, blocked, block_level) + + logger.debug("Appending blocker='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level) + blockdict.append({ + "blocker" : domain, + "blocked" : blocked, + "reason" : None, + "block_level": block_level, + }) + elif "quarantined_instances" in data: - logger.debug(f"Found 'quarantined_instances' in JSON response: domain='{domain}'") + logger.debug("Found 'quarantined_instances' in JSON response: domain='%s'", domain) found = True block_level = "quarantined" for blocked in data["quarantined_instances"]: - logger.debug("BEFORE blocked:", blocked) + logger.debug("blocked='%s' - BEFORE!", blocked) blocked = tidyup.domain(blocked) - logger.debug("AFTER blocked:", blocked) + logger.debug("blocked='%s' - AFTER!", blocked) if blocked == "": - logger.warning("blocked is empty after tidyup.domain():", domain, block_level) + logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level) continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) + elif not utils.is_domain_wanted(blocked): + logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif blocked.count("*") > 0: - # Obscured domain name with no hash - row = instances.deobscure("*", blocked) - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Obscured domain name with no hash - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain) + blocked = utils.deobfuscate_domain(blocked, domain) - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - - logger.debug(f"blocked='{blocked}'") + logger.debug("blocked='%s' - DEOBFUSCATED!", blocked) if not utils.is_domain_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif not instances.is_registered(blocked): - # Commit changes - logger.debug("Invoking commit() ...") - database.connection.commit() - - logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") - instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) - if not blocks.is_instance_blocked(domain, blocked, block_level): - logger.debug("Blocking:", domain, blocked, block_level) - blocks.add_instance(domain, blocked, None, block_level) + logger.debug("Appending blocker='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level) + blockdict.append({ + "blocker" : domain, + "blocked" : blocked, + "reason" : None, + "block_level": block_level, + }) - if block_level == "reject": - logger.debug("Adding to blockdict:", blocked) - blockdict.append({ - "blocked": blocked, - "reason" : None - }) - else: - logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...") - blocks.update_last_seen(domain, blocked, block_level) else: - logger.warning(f"Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='{domain}'") + logger.warning("Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='%s'", domain) logger.debug("Invoking commit() ...") database.connection.commit() # Reasons if "mrf_simple_info" in data: - logger.debug("Found mrf_simple_info:", domain) + logger.debug("Found mrf_simple_info in API response: domain='%s'", domain) found = True for block_level, info in ( { @@ -260,255 +177,138 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str): **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {}) } ).items(): - logger.debug("block_level, info.items():", block_level, len(info.items())) + logger.debug("block_level='%s', info.items()=%d", block_level, len(info.items())) block_level = tidyup.domain(block_level) - logger.debug("BEFORE block_level:", block_level) + logger.debug("block_level='%s' - AFTER!", block_level) if block_level == "": logger.warning("block_level is now empty!") continue elif block_level == "accept": - logger.debug("domain='%s' skipping block_level='accept'", domain) + logger.debug("domain='%s': Skipping block_level='%s' ...", domain, block_level) continue - logger.debug(f"Checking {len(info.items())} entries from domain='{domain}',block_level='{block_level}' ...") + block_level = utils.alias_block_level(block_level) + + logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(info.items()), domain, block_level) for blocked, reason in info.items(): - logger.debug(f"blocked='{blocked}',reason[{type(reason)}]='{reason}' - BEFORE!") + logger.debug("blocked='%s',reason[%s]='%s' - BEFORE!", blocked, type(reason), reason) blocked = tidyup.domain(blocked) + logger.debug("blocked='%s' - AFTER!", blocked) if isinstance(reason, str): logger.debug("reason[] is a string") reason = tidyup.reason(reason) elif isinstance(reason, dict) and "reason" in reason: logger.debug("reason[] is a dict") - reason = tidyup.reason(reason["reason"]) + reason = tidyup.reason(reason["reason"]) if isinstance(reason["reason"], str) else None elif reason is not None: raise ValueError(f"Cannot handle reason[]='{type(reason)}'") logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) if blocked == "": - logger.warning("blocked is empty after tidyup.domain():", domain, block_level) + logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level) continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) - continue - elif blocked.count("*") > 0: - # Obscured domain name with no hash - row = instances.deobscure("*", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Obscured domain name with no hash - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - - logger.debug(f"blocked='{blocked}'") - if not utils.is_domain_wanted(blocked): + elif not utils.is_domain_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif not instances.is_registered(blocked): - logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") - instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) - logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'") - blocks.update_reason(reason, domain, blocked, block_level) + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain) + blocked = utils.deobfuscate_domain(blocked, domain) + logger.debug("blocked='%s' - DEOBFUSCATED!", blocked) - logger.debug(f"blockdict()={len(blockdict)}") - for entry in blockdict: - if entry["blocked"] == blocked: - logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'") - entry["reason"] = reason + logger.debug("Checking %d blockdict records ...", len(blockdict)) + for block in blockdict: + logger.debug("block[blocked]='%s',blocked='%s'", block["blocked"], blocked) + if block["blocked"] == blocked: + logger.debug("Updating reason='%s' for blocker='%s'", reason, block["blocked"]) + block["reason"] = reason elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]: - logger.debug(f"Found 'quarantined_instances_info' in JSON response: domain='{domain}'") + logger.debug("Found 'quarantined_instances_info' in JSON response: domain='%s'", domain) found = True block_level = "quarantined" #print(data["quarantined_instances_info"]) rows = data["quarantined_instances_info"]["quarantined_instances"] for blocked in rows: - logger.debug("BEFORE blocked:", blocked) + logger.debug("blocked='%s' - BEFORE!", blocked) blocked = tidyup.domain(blocked) - logger.debug("AFTER blocked:", blocked) + reason = tidyup.reason(rows[blocked]["reason"]) + logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) if blocked not in rows or "reason" not in rows[blocked]: - logger.warning(f"Cannot find blocked='{blocked}' in rows()={len(rows)},domain='{domain}'") + logger.warning("Cannot find blocked='%s' in rows()=%d,domain='%s' - BREAK!", blocked, len(rows), domain) break - - reason = rows[blocked]["reason"] - logger.debug(f"reason='{reason}'") - - if blocked == "": - logger.warning("blocked is empty after tidyup.domain():", domain, block_level) + elif blocked == "": + logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level) continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) + elif not utils.is_domain_wanted(blocked): + logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif blocked.count("*") > 0: - # Obscured domain name with no hash - row = instances.deobscure("*", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Obscured domain name with no hash - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", blocked, domain) + blocked = utils.deobfuscate_domain(blocked, domain) - logger.debug(f"blocked='{blocked}'") + logger.debug("blocked='%s' - DEOBFUSCATED!", blocked) if not utils.is_domain_wanted(blocked): logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked) continue - elif not instances.is_registered(blocked): - logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") - instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) - - logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'") - blocks.update_reason(reason, domain, blocked, block_level) - - logger.debug(f"blockdict()={len(blockdict)}") - for entry in blockdict: - if entry["blocked"] == blocked: - logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'") - entry["reason"] = reason + + logger.debug("Checking %d blockdict records ...", len(blockdict)) + for block in blockdict: + logger.debug("block[blocked]='%s',blocked='%s'", block["blocked"], blocked) + if block["blocked"] == blocked: + logger.debug("Updating reason='%s' for blocker='%s'", reason, block["blocked"]) + block["reason"] = reason else: - logger.warning(f"Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='{domain}'") + logger.warning("Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='%s'", domain) if not found: - logger.debug(f"Did not find any useable JSON elements, domain='{domain}', continuing with /about page ...") + logger.debug("Did not find any useable JSON elements, domain='%s', continuing with /about page ...", domain) blocklist = fetch_blocks_from_about(domain) - logger.debug(f"blocklist()={len(blocklist)}") + logger.debug("blocklist()=%d", len(blocklist)) if len(blocklist) > 0: - logger.info("Checking %d record(s) ...", len(blocklist)) + logger.info("Checking %d different blocklists ...", len(blocklist)) for block_level in blocklist: logger.debug("block_level='%s'", block_level) - rows = blocklist[block_level] - logger.debug(f"rows['{type(rows)}]()={len(rows)}'") - for record in rows: - logger.debug(f"record[]='{type(record)}'") - blocked = tidyup.domain(record["blocked"]) - reason = tidyup.reason(record["reason"]) - logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason) - if blocked == "": - logger.warning("blocked is empty after tidyup.domain():", domain, block_level) - continue - elif blacklist.is_blacklisted(blocked): - logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked) - continue - elif blocked.count("*") > 0: - # Obscured domain name with no hash - row = instances.deobscure("*", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - elif blocked.count("?") > 0: - # Obscured domain name with no hash - row = instances.deobscure("?", blocked) - - logger.debug("row[]='%s'", type(row)) - if row is None: - logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!") - continue - - logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'") - blocked = row[0] - origin = row[1] - nodeinfo_url = row[2] - - logger.debug(f"blocked='{blocked}'") - if not utils.is_domain_wanted(blocked): - logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked) + logger.debug("rows[%s]()=%d'", type(rows), len(rows)) + for block in rows: + logger.debug("Invoking utils.deobfuscate_domain(%s, %s) ...", block["blocked"], domain) + block["blocked"] = utils.deobfuscate_domain(block["blocked"], domain) + + logger.debug("block[blocked]='%s' - DEOBFUSCATED!", block["blocked"]) + if not utils.is_domain_wanted(block["blocked"]): + logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"]) continue - elif not instances.is_registered(blocked): - logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'") - instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url) - - if not blocks.is_instance_blocked(domain, blocked, block_level): - logger.debug("Blocking:", domain, blocked, block_level) - blocks.add_instance(domain, blocked, reason, block_level) - - if block_level == "reject": - logger.debug("Adding to blockdict:", blocked) - blockdict.append({ - "blocked": blocked, - "reason" : reason - }) - else: - logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...") - blocks.update_reason(reason, domain, blocked, block_level) - logger.debug("Invoking commit() ...") - database.connection.commit() + logger.debug("Appending blocker='%s',block[blocked]='%s',block[reason]='%s',block_level='%s' ...",domain, block["blocked"], block["reason"], block_level) + blockdict.append({ + "blocker" : domain, + "blocked" : block["blocked"], + "reason" : block["reason"], + "block_level": block_level, + }) - logger.debug("EXIT!") + logger.debug("blockdict()=%d - EXIT!", len(blockdict)) + return blockdict def fetch_blocks_from_about(domain: str) -> dict: - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - - logger.debug(f"Fetching mastodon blocks from domain='{domain}'") + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) + + logger.debug("Fetching mastodon blocks from domain='%s'", domain) doc = None for path in ["/instance/about/index.html"]: try: # Resetting doc type doc = None - logger.debug(f"Fetching path='{path}' from domain='{domain}' ...") + logger.debug("Fetching path='%s' from domain='%s' ...", path, domain) response = network.fetch_response( domain, path, @@ -516,12 +316,12 @@ def fetch_blocks_from_about(domain: str) -> dict: (config.get("connection_timeout"), config.get("read_timeout")) ) - logger.debug(f"response.ok='{response.ok}',response.status_code='{response.status_code}',response.text()={len(response.text)}") + logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text)) if not response.ok or response.text.strip() == "": - logger.warning(f"path='{path}' does not exist on domain='{domain}' - SKIPPED!") + logger.warning("path='%s' does not exist on domain='%s' - SKIPPED!", path, domain) continue - logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...") + logger.debug("Parsing response.text()=%d Bytes ...", len(response.text)) doc = bs4.BeautifulSoup( response.text, "html.parser", @@ -529,52 +329,61 @@ def fetch_blocks_from_about(domain: str) -> dict: logger.debug("doc[]='%s'", type(doc)) if doc.find("h2") is not None: - logger.debug(f"Found 'h2' header in path='{path}' - BREAK!") + logger.debug("Found 'h2' header in path='%s' - BREAK!", path) break except network.exceptions as exception: - logger.warning("Cannot fetch from domain:", domain, exception) + logger.warning("Cannot fetch from domain='%s',exception[%s]='%s'", domain, type(exception), str(exception)) instances.set_last_error(domain, exception) break blocklist = { - "Suspended servers": [], - "Filtered media" : [], - "Limited servers" : [], - "Silenced servers" : [], + "reject" : [], + "filtered_media": [], + "followers_only": [], + "silenced" : [], } logger.debug("doc[]='%s'", type(doc)) if doc is None: - logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!") - return blocklist + logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain) + return list() - for header in doc.find_all("h2"): - header_text = tidyup.reason(header.text) + headers = doc.find_all("h2") - logger.debug(f"header_text='{header_text}' - BEFORE!") - if header_text in language_mapping: - logger.debug(f"header_text='{header_text}' - FOUND!") - header_text = language_mapping[header_text] + logger.debug("headers[]='%s'", type(headers)) + if headers is None: + logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain) + return list() + + logger.info("Checking %d headers ...", len(headers)) + for header in headers: + logger.debug("header[%s]='%s'", type(header), header) + block_level = tidyup.reason(header.text).lower() + + logger.debug("block_level='%s' - BEFORE!", block_level) + if block_level in language_mapping: + logger.debug("block_level='%s' - FOUND!", block_level) + block_level = language_mapping[block_level].lower() else: - logger.warning(f"header_text='{header_text}' not found in language mapping table") + logger.warning("block_level='%s' not found in language mapping table", block_level) - logger.debug(f"header_text='{header_text} - AFTER!'") - if header_text in blocklist or header_text.lower() in blocklist: + logger.debug("block_level='%s - AFTER!'", block_level) + if block_level in blocklist: # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu - logger.debug(f"Found header_text='{header_text}', importing domain blocks ...") + logger.debug("Found block_level='%s', importing domain blocks ...", block_level) for line in header.find_next("table").find_all("tr")[1:]: - logger.debug(f"line[]='{type(line)}'") - blocklist[header_text].append({ - "blocked": tidyup.domain(line.find_all("td")[0].text), - "reason" : tidyup.reason(line.find_all("td")[1].text), + logger.debug("line[]='%s'", type(line)) + blocked = tidyup.domain(line.find_all("td")[0].text) + reason = tidyup.reason(line.find_all("td")[1].text) + + logger.debug("Appending block_level='%s',blocked='%s',reason='%s' ...", block_level, blocked, reason) + blocklist[block_level].append({ + "blocked": blocked, + "reason" : reason, }) else: - logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}") + logger.warning("block_level='%s' not found in blocklist()=%d", block_level, len(blocklist)) - logger.debug(f"Returning blocklist for domain='{domain}'") - return { - "reject" : blocklist["Suspended servers"], - "media_removal" : blocklist["Filtered media"], - "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"], - } + logger.debug("Returning blocklist for domain='%s' - EXIT!", domain) + return blocklist