From 5caa049f20a6530ae2357d1b33d697c1a52f8b73 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Wed, 17 May 2023 05:09:59 +0200 Subject: [PATCH] Continued: - encapsulation, still tons of duplicate records in `instances` table --- api.py | 16 +-- fetch_blocks.py | 291 +++++++++++++++++++++++++++--------------------- 2 files changed, 173 insertions(+), 134 deletions(-) diff --git a/api.py b/api.py index 8e95e58..5df098f 100644 --- a/api.py +++ b/api.py @@ -24,7 +24,7 @@ def get_hash(domain: str) -> str: def info(): conn = sqlite3.connect("blocks.db") c = conn.cursor() - c.execute("SELECT (SELECT count(domain) FROM instances), (SELECT count(domain) FROM instances WHERE software in ('pleroma', 'mastodon', 'misskey', 'gotosocial', 'friendica')), (SELECT count(blocker) FROM blocks)") + c.execute("SELECT (SELECT count(domain) FROM instances), (SELECT count(domain) FROM instances WHERE software IN ('pleroma', 'mastodon', 'misskey', 'gotosocial', 'friendica')), (SELECT count(blocker) FROM blocks)") known, indexed, blocks = c.fetchone() c.close() return { @@ -42,11 +42,11 @@ def top(blocked: int = None, blockers: int = None): elif blocked != None: if blocked > 500: raise HTTPException(status_code=400, detail="Too many results") - c.execute("SELECT blocked, count(blocked) FROM blocks WHERE block_level = 'reject' group by blocked order by count(blocked) desc limit ?", (blocked,)) + c.execute("SELECT blocked, count(blocked) FROM blocks WHERE block_level = 'reject' group by blocked ORDER BY count(blocked) DESC LIMIT ?", (blocked,)) elif blockers != None: if blockers > 500: raise HTTPException(status_code=400, detail="Too many results") - c.execute("SELECT blocker, count(blocker) FROM blocks WHERE block_level = 'reject' group by blocker order by count(blocker) desc limit ?", (blockers,)) + c.execute("SELECT blocker, count(blocker) FROM blocks WHERE block_level = 'reject' group by blocker ORDER BY count(blocker) DESC LIMIT ?", (blockers,)) scores = c.fetchall() c.close() @@ -70,12 +70,12 @@ def blocked(domain: str = None, reason: str = None, reverse: str = None): if domain != None: wildchar = "*." + ".".join(domain.split(".")[-domain.count("."):]) punycode = domain.encode('idna').decode('utf-8') - c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? order by first_added asc", + c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_added asc", (domain, "*." + domain, wildchar, get_hash(domain), punycode, "*." + punycode)) elif reverse != None: - c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocker = ? order by first_added asc", (reverse,)) + c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocker = ? ORDER BY first_added asc", (reverse,)) else: - c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE reason like ? AND reason != '' order by first_added asc", ("%"+reason+"%",)) + c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE reason like ? AND reason != '' ORDER BY first_added asc", ("%"+reason+"%",)) blocks = c.fetchall() c.close() @@ -163,10 +163,10 @@ def rss(request: Request, domain: str = None): if domain != None: wildchar = "*." + ".".join(domain.split(".")[-domain.count("."):]) punycode = domain.encode('idna').decode('utf-8') - c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? order by first_added desc limit 50", + c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_added DESC LIMIT 50", (domain, "*." + domain, wildchar, get_hash(domain), punycode, "*." + punycode)) else: - c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks order by first_added desc limit 50") + c.execute("SELECT blocker, blocked, block_level, reason, first_added, last_seen FROM blocks ORDER BY first_added DESC LIMIT 50") blocks = c.fetchall() c.close() diff --git a/fetch_blocks.py b/fetch_blocks.py index 4bbe4ea..58f8659 100644 --- a/fetch_blocks.py +++ b/fetch_blocks.py @@ -16,6 +16,55 @@ headers = { "user-agent": config["useragent"] } +def update_block_reason(reason: str, blocker: str, blocked: str, block_level: str): + # NOISY: print("--- Updating block reason:", reason, blocker, blocked, block_level) + c.execute( + "UPDATE blocks SET reason = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND reason = ''", + ( + reason, + blocker, + blocked, + block_level + ), + ) + +def update_last_seen(last_seen: int, blocker: str, blocked: str, block_level: str): + # NOISY: print("--- Updating last_seen:", last_seen, blocker, blocked, block_level) + c.execute( + "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ?", + ( + last_seen, + blocker, + blocked, + block_level + ) + ) + +def block_instance(blocker: str, blocked: str, reason: str, block_level: str, first_added: int, last_seen: int): + print("--- New block:", blocker, blocked, reason, block_level, first_added, last_seen) + c.execute( + "INSERT INTO blocks SELECT ?, ?, ?, ?, ?, ?", + ( + blocker, + blocked, + reason, + block_level, + first_added, + last_seen + ), + ) + +def add_instance (domain: str): + print("--- Adding new instance:", domain) + c.execute( + "INSERT INTO instances SELECT ?, ?, ?", + ( + blocked, + get_hash(blocked), + get_type(blocked) + ), + ) + def send_bot_post(instance: str, blocks: dict): message = instance + " has blocked the following instances:\n\n" truncated = False @@ -104,13 +153,19 @@ def get_friendica_blocks(domain: str) -> dict: return {} blocklist = doc.find(id="about_blocklist") + + # Prevents exceptions: + if blocklist is None: + print("Instance has no block list:", domain) + return {} + for line in blocklist.find("table").find_all("tr")[1:]: - blocks.append( - { - "domain": line.find_all("td")[0].text.strip(), - "reason": line.find_all("td")[1].text.strip() - } - ) + blocks.append( + { + "domain": line.find_all("td")[0].text.strip(), + "reason": line.find_all("td")[1].text.strip() + } + ) return { "reject": blocks @@ -227,7 +282,8 @@ conn = sqlite3.connect("blocks.db") c = conn.cursor() c.execute( - "SELECT domain, software FROM instances WHERE software in ('pleroma', 'mastodon', 'friendica', 'misskey', 'gotosocial')" + "SELECT domain, software FROM instances WHERE domain='tooting.intensifi.es'" + #"SELECT domain, software FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'gotosocial')" ) for blocker, software in c.fetchall(): @@ -247,35 +303,39 @@ for blocker, software in c.fetchall(): ).items(): for blocked in blocks: blocked = tidyup(blocked) + if blocked == "": + print("WARNING: blocked is empty after tidyup():", blocker, block_level) continue + if blocked.count("*") > 1: # -ACK!-oma also started obscuring domains without hash c.execute( - "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) + "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) ) searchres = c.fetchone() if searchres != None: blocked = searchres[0] c.execute( - "SELECT domain FROM instances WHERE domain = ?", (blocked,) + "SELECT domain FROM instances WHERE domain = ?", (blocked) ) + if c.fetchone() == None: - c.execute( - "INSERT INTO instances SELECT ?, ?, ?", - (blocked, get_hash(blocked), get_type(blocked)), - ) + add_instance(blocked) + timestamp = int(time()) c.execute( - "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", - (blocker, blocked, block_level), + "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", + ( + blocker, + blocked, + block_level + ), ) if c.fetchone() == None: - c.execute( - "INSERT INTO blocks SELECT ?, ?, '', ?, ?, ?", - (blocker, blocked, block_level, timestamp, timestamp), - ) + block_instance(blocker, blocked, reason, block_level, timestamp, timestamp) + if block_level == "reject": blockdict.append( { @@ -283,10 +343,7 @@ for blocker, software in c.fetchall(): "reason": None }) else: - c.execute( - "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ?", - (timestamp, blocker, blocked, block_level) - ) + update_last_seen(timestamp, blocker, blocked, block_level) conn.commit() # Reasons if "mrf_simple_info" in federation: @@ -298,20 +355,23 @@ for blocker, software in c.fetchall(): ).items(): for blocked, reason in info.items(): blocked = tidyup(blocked) + if blocked == "": + print("WARNING: blocked is empty after tidyup():", blocker, block_level) continue + if blocked.count("*") > 1: # same domain guess as above, but for reasons field c.execute( - "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) + "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) ) searchres = c.fetchone() + if searchres != None: blocked = searchres[0] - c.execute( - "UPDATE blocks SET reason = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND reason = ''", - (reason["reason"], blocker, blocked, block_level), - ) + + update_block_reason(reason["reason"], blocker, blocked, block_level) + for entry in blockdict: if entry["blocked"] == blocked: entry["reason"] = reason["reason"] @@ -345,6 +405,7 @@ for blocker, software in c.fetchall(): blocks = get( f"https://{blocker}/api/v1/instance/domain_blocks", headers=reqheaders, timeout=5 ).json() + for block in blocks: entry = {'domain': block['domain'], 'hash': block['digest'], 'reason': block['comment']} if block['severity'] == 'suspend': @@ -355,6 +416,8 @@ for blocker, software in c.fetchall(): json['media_removal'].append(entry) elif block['severity'] == 'reject_reports': json['report_removal'].append(entry) + else: + print("WARNING: Unknown severity:", block['severity'], block['domain']) except: json = get_mastodon_blocks(blocker) @@ -362,19 +425,18 @@ for blocker, software in c.fetchall(): for instance in blocks: blocked, blocked_hash, reason = instance.values() blocked = tidyup(blocked) + if blocked.count("*") <= 1: c.execute( - "SELECT hash FROM instances WHERE hash = ?", (blocked_hash,) + "SELECT hash FROM instances WHERE hash = ?", (blocked_hash,) ) + if c.fetchone() == None: - c.execute( - "INSERT INTO instances SELECT ?, ?, ?", - (blocked, get_hash(blocked), get_type(blocked)), - ) + add_instance(blocked) else: # Doing the hash search for instance names as well to tidy up DB c.execute( - "SELECT domain FROM instances WHERE hash = ?", (blocked_hash,) + "SELECT domain FROM instances WHERE hash = ?", (blocked_hash,) ) searchres = c.fetchone() if searchres != None: @@ -382,21 +444,16 @@ for blocker, software in c.fetchall(): timestamp = int(time()) c.execute( - "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", - (blocker, blocked if blocked.count("*") <= 1 else blocked_hash, block_level), + "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", + ( + blocker, + blocked if blocked.count("*") <= 1 else blocked_hash, + block_level + ), ) if c.fetchone() == None: - c.execute( - "INSERT INTO blocks SELECT ?, ?, ?, ?, ?, ?", - ( - blocker, - blocked if blocked.count("*") <= 1 else blocked_hash, - reason, - block_level, - timestamp, - timestamp, - ), - ) + block_instance(blocker, blocked if blocked.count("*") <= 1 else blocked_hash, reason, block_level, timestamp, timestamp) + if block_level == "reject": blockdict.append( { @@ -404,15 +461,11 @@ for blocker, software in c.fetchall(): "reason": reason }) else: - c.execute( - "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ?", - (timestamp, blocker, blocked if blocked.count("*") <= 1 else blocked_hash, block_level), - ) + update_last_seen(timestamp, blocker, blocked if blocked.count("*") <= 1 else blocked_hash, block_level) + if reason != '': - c.execute( - "UPDATE blocks SET reason = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND reason = ''", - (reason, blocker, blocked if blocked.count("*") <= 1 else blocked_hash, block_level), - ) + update_block_reason(reason, blocker, blocked if blocked.count("*") <= 1 else blocked_hash, block_level) + conn.commit() except Exception as e: print("error:", e, blocker) @@ -428,10 +481,11 @@ for blocker, software in c.fetchall(): blocked, reason = instance.values() blocked = tidyup(blocked) + print("BEFORE-blocked:", blocked) if blocked.count("*") > 0: # Some friendica servers also obscure domains without hash c.execute( - "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) + "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) ) searchres = c.fetchone() if searchres != None: @@ -440,38 +494,28 @@ for blocker, software in c.fetchall(): if blocked.count("?") > 0: # Some obscure them with question marks, not sure if that's dependent on version or not c.execute( - "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("?", "_"),) + "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("?", "_"),) ) searchres = c.fetchone() if searchres != None: blocked = searchres[0] + print("AFTER-blocked:", blocked) c.execute( - "SELECT domain FROM instances WHERE domain = ?", (blocked,) + "SELECT domain FROM instances WHERE domain = ?", (blocked,) ) + if c.fetchone() == None: - c.execute( - "INSERT INTO instances SELECT ?, ?, ?", - (blocked, get_hash(blocked), get_type(blocked)), - ) + add_instance(blocked) timestamp = int(time()) c.execute( - "SELECT * FROM blocks WHERE blocker = ? AND blocked = ?", + "SELECT * FROM blocks WHERE blocker = ? AND blocked = ?", (blocker, blocked), ) if c.fetchone() == None: - c.execute( - "INSERT INTO blocks SELECT ?, ?, ?, ?, ?, ?", - ( - blocker, - blocked, - reason, - block_level, - timestamp, - timestamp - ), - ) + block_instance(blocker, blocked, reason, block_level, timestamp, timestamp) + if block_level == "reject": blockdict.append( { @@ -479,15 +523,11 @@ for blocker, software in c.fetchall(): "reason": reason }) else: - c.execute( - "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ?", - (timestamp, blocker, blocked, block_level), - ) + update_last_seen(timestamp, blocker, blocked, block_level) + if reason != '': - c.execute( - "UPDATE blocks SET reason = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND reason = ''", - (reason, blocker, blocked, block_level), - ) + update_block_reason(reason, blocker, blocked, block_level) + conn.commit() except Exception as e: print("error:", e, blocker) @@ -498,56 +538,55 @@ for blocker, software in c.fetchall(): federation = get( f"https://{blocker}/api/v1/instance/peers?filter=suspended", headers=headers, timeout=5 ).json() - for peer in federation: - blocked = peer["domain"].lower() - if blocked.count("*") > 0: - # GTS does not have hashes for obscured domains, so we have to guess it - c.execute( - "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) - ) - searchres = c.fetchone() - if searchres != None: - blocked = searchres[0] + if (federation == None): + print("WARNING: No valid response:", blocker); + else: + for peer in federation: + blocked = peer["domain"].lower() + + if blocked.count("*") > 0: + # GTS does not have hashes for obscured domains, so we have to guess it + c.execute( + "SELECT domain FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", (blocked.replace("*", "_"),) + ) + searchres = c.fetchone() + + if searchres != None: + blocked = searchres[0] - c.execute( - "SELECT domain FROM instances WHERE domain = ?", (blocked,) - ) - if c.fetchone() == None: - c.execute( - "INSERT INTO instances SELECT ?, ?, ?", - (blocked, get_hash(blocked), get_type(blocked)), - ) - c.execute( - "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", - (blocker, blocked, "reject"), - ) - timestamp = int(time()) - if c.fetchone() == None: - c.execute( - "INSERT INTO blocks SELECT ?, ?, ?, ?, ?, ?", - (blocker, blocked, "", "reject", timestamp, timestamp), - ) - blockdict.append( - { - "blocked": blocked, - "reason": None - }) - else: c.execute( - "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ?", - (timestamp, blocker, blocked, "reject"), + "SELECT domain FROM instances WHERE domain = ?", (blocked,) ) - if "public_comment" in peer: - reason = peer["public_comment"] + + if c.fetchone() == None: + add_instance(blocked) + c.execute( - "UPDATE blocks SET reason = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND reason = ''", - (reason, blocker, blocked, "reject"), + "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ?", + (blocker, blocked, "reject"), ) - for entry in blockdict: - if entry["blocked"] == blocked: - entry["reason"] = reason - conn.commit() + timestamp = int(time()) + + if c.fetchone() == None: + block_instance(blocker, blocked, "", "reject", timestamp, timestamp) + + blockdict.append( + { + "blocked": blocked, + "reason": None + }) + else: + update_last_seen(timestamp, blocker, blocked, "reject") + + if "public_comment" in peer: + reason = peer["public_comment"] + update_block_reason(reason, blocker, blocked, "reject") + + for entry in blockdict: + if entry["blocked"] == blocked: + entry["reason"] = reason + conn.commit() except Exception as e: print("error:", e, blocker) -- 2.39.5