From: Roland Häder Date: Sun, 25 Jun 2023 09:22:28 +0000 (+0200) Subject: Continued: X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=1a4c92ccd6cc124fc63b4f11130cdddcfb4696e0;p=fba.git Continued: - added command fetch_joinfediverse() to fetch domain blocks from climatejustice.social's wiki --- diff --git a/fba/boot.py b/fba/boot.py index d9f64d5..7843a30 100644 --- a/fba/boot.py +++ b/fba/boot.py @@ -127,6 +127,13 @@ def init_parser(): ) parser.set_defaults(command=commands.fetch_txt) + ### Fetch blocks from joinfediverse.wiki ### + parser = subparser_command.add_parser( + "fetch_joinfediverse", + help="Fetches FediBlock page from joinfediverse.wiki", + ) + parser.set_defaults(command=commands.fetch_joinfediverse) + ### Fetch blocks from fediverse.observer ### parser = subparser_command.add_parser( "fetch_observer", diff --git a/fba/commands.py b/fba/commands.py index f443520..4923bf5 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -860,7 +860,7 @@ def fetch_txt(args: argparse.Namespace) -> int: logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain) continue - logger.debug("domain='%s',row[blocker]='%s'", domain, row["blocker"]) + logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"]) processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name) logger.debug("processed='%s'", processed) @@ -909,3 +909,159 @@ def fetch_fedipact(args: argparse.Namespace) -> int: logger.debug("Success! 
def _joinfediverse_scrape_tables(doc) -> list:
    """Extract raw block records from every ``wikitable`` table in ``doc``.

    A row with more than one header cell is treated as a header row: the
    1-based positions of the recognised column names are remembered and
    applied to the rows that follow, until the next header row.

    Returns a list of dicts. Each dict may contain the keys ``blocked``
    (cell text of a "domain" or "instance" column), ``subdomain(s)`` (list of
    strings split on "/") and ``block reason(s)`` (raw cell text).
    """
    wanted_headers = ("domain", "instance", "subdomain(s)", "block reason(s)")
    blocklist = []

    tables = doc.findAll("table", {"class": "wikitable"})
    logger.info("Analyzing %d table(s) ...", len(tables))
    for table in tables:
        logger.debug("table[]='%s'", type(table))

        rows = table.findAll("tr")
        logger.info("Checking %d row(s) ...", len(rows))

        # Column position (1-based) -> recognised header name
        block_headers = {}
        for row in rows:
            headers = row.findAll("th")
            if len(headers) > 1:
                # Header row: start a fresh column mapping
                block_headers = {}
                for cnt, header in enumerate(headers, start=1):
                    # An empty header cell has no children, avoid IndexError
                    text = header.contents[0] if header.contents else None

                    if not isinstance(text, str):
                        # First child is a nested tag (or missing), not plain text
                        continue
                    elif validators.domain(text.strip()):
                        # A domain name used as a heading is not a column name
                        continue

                    text = tidyup.domain(text.strip())
                    if text in wanted_headers:
                        logger.debug("Found header: '%s'=%d", text, cnt)
                        block_headers[cnt] = text
                continue

            if not block_headers:
                # No usable header row seen yet, row is not scrapable
                continue

            logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
            block = {}
            for cnt, element in enumerate(row.find_all(["th", "td"]), start=1):
                logger.debug("element[]='%s',cnt=%d", type(element), cnt)
                if cnt not in block_headers:
                    continue

                logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
                text = element.text.strip()

                # Both "domain" and "instance" columns carry the blocked domain
                key = "blocked" if block_headers[cnt] in ("domain", "instance") else block_headers[cnt]
                logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)

                if key == "subdomain(s)":
                    # Slash-separated list; blank parts would yield ".example.com"
                    block[key] = [part.strip() for part in text.split("/") if part.strip()]
                else:
                    logger.debug("key='%s'", key)
                    block[key] = text

            logger.debug("block()=%d ...", len(block))
            if block:
                logger.debug("Appending block()=%d ...", len(block))
                blocklist.append(block)

    return blocklist


def _joinfediverse_expand_subdomains(blocklist: list) -> list:
    """Turn records with listed subdomains into one record per subdomain.

    Records without subdomains are passed through unchanged. Records with
    subdomains are replaced by one independent copy per subdomain whose
    ``blocked`` value is ``subdomain + "." + origin``. Records without a
    ``blocked`` value are dropped because nothing can be stored for them.
    """
    blocking = []
    for block in blocklist:
        logger.debug("block='%s'", block)

        origin = block.get("blocked")
        if not origin:
            logger.debug("block='%s' has no blocked domain - SKIPPED!", block)
            continue

        subdomains = block.get("subdomain(s)")
        if not subdomains:
            blocking.append(block)
            continue

        for subdomain in subdomains:
            # Copy the record: appending the same dict repeatedly would leave
            # every entry pointing at the last subdomain only.
            entry = dict(block)
            entry["blocked"] = subdomain + "." + origin
            blocking.append(entry)

    return blocking


def fetch_joinfediverse(args: argparse.Namespace) -> int:
    """Fetch the FediBlock page from joinfediverse.wiki and record its blocks.

    The page's ``wikitable`` tables are scraped for blocked domains, their
    subdomains and block reasons. Each wanted, not recently checked domain is
    handed to ``utils.process_domain()``. Then, for every instance in the
    database whose domain matches ``climatejustice.%``, a ``reject`` block is
    added or its last-seen timestamp and reason are updated.

    Parameters:
        args: Parsed command-line arguments (only logged here).

    Returns:
        Always 0.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    raw = utils.fetch_url(
        "https://joinfediverse.wiki/FediBlock",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    blocklist = _joinfediverse_scrape_tables(doc)
    logger.debug("blocklist()=%d", len(blocklist))

    database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
    domains = database.cursor.fetchall()
    logger.debug("domains(%d)[]='%s'", len(domains), type(domains))

    blocking = _joinfediverse_expand_subdomains(blocklist)
    logger.debug("blocking()=%d", len(blocking))

    # Normalise every record once and keep only the wanted ones; the result is
    # reused for each blocker below.
    # NOTE(review): assumes utils.is_domain_wanted() gives the same answer for
    # the whole run - confirm.
    wanted = []
    for block in blocking:
        block["blocked"] = tidyup.domain(block["blocked"])
        block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None

        if not utils.is_domain_wanted(block["blocked"]):
            logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            continue

        wanted.append(block)
        if instances.is_recent(block["blocked"]):
            # Still recorded as a block below, just not crawled again
            logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
            continue

        logger.info("Proccessing blocked='%s' ...", block["blocked"])
        utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)

    bot_enabled = config.get("bot_enabled")
    for row in domains:
        blocker = row[0]
        logger.debug("blocker[%s]='%s'", type(blocker), blocker)

        # Newly added blocks of this blocker only, for the bot notification
        blockdict = []
        for block in wanted:
            logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
            if not blocks.is_instance_blocked(blocker, block["blocked"], "reject"):
                logger.debug("Invoking blocks.add_instance(%s, %s, %s, %s)", blocker, block["blocked"], block["reason"], "reject")
                blocks.add_instance(blocker, block["blocked"], block["reason"], "reject")

                logger.debug("block_level='%s',config[bot_enabled]='%s'", "reject", bot_enabled)
                if bot_enabled:
                    logger.debug("blocker='%s' has blocked '%s' with reason='%s' - Adding to bot notification ...", blocker, block["blocked"], block["reason"])
                    blockdict.append({
                        "blocked": block["blocked"],
                        "reason" : block["reason"],
                    })
            else:
                logger.debug("Updating block last seen and reason for blocker='%s',blocked='%s' ...", blocker, block["blocked"])
                blocks.update_last_seen(blocker, block["blocked"], "reject")
                blocks.update_reason(block["reason"], blocker, block["blocked"], "reject")

        # NOTE(review): flush, commit and notification are done per blocker;
        # the original nesting could not be recovered from the patch text - confirm.
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config[bot_enabled]='%s',blockdict()=%d'", bot_enabled, len(blockdict))
        if bot_enabled and blockdict:
            logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0