def strip_hosted_on(software: str) -> str:
    """Cut a "hosted on ..." suffix off a software / site name.

    Everything from the first occurrence of "hosted on" onwards is removed
    and the remainder is whitespace-stripped, e.g.
    "Mastodon hosted on example.com" becomes "Mastodon".  If the remainder
    still contains " - ", it is additionally passed through
    strip_until(software, " - ").

    Args:
        software: Name to clean up; must not be empty.

    Returns:
        The cleaned name, or 'software' unchanged (after printing a
        warning) when it does not contain "hosted on".  The result may be
        an empty string when nothing precedes the marker.

    Raises:
        ValueError: If 'software' is an empty string (ValueError is a
            subclass of Exception, so existing handlers keep working).
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")

    # Use one and the same marker for the presence check and for the cut
    # position, so a match can never end up with find() == -1.
    end = software.find("hosted on")
    # DEBUG: print(f"DEBUG: end[{type(end)}]='{end}'")
    if end < 0:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    # Keep only the part in front of the marker.
    software = software[:end].strip()
    # DEBUG: print(f"DEBUG: software='{software}'")

    # Only delegate when there is something to strip: strip_until() warns
    # when the separator is missing and rejects an empty string.
    if " - " in software:
        software = strip_until(software, " - ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
LIMIT 1", [ time.time(), @@ -454,7 +474,7 @@ def update_last_instance_fetch(domain: str): sys.exit(255) connection.commit() - #print("DEBUG: EXIT!") + # DEBUG: print("DEBUG: EXIT!") def update_last_nodeinfo(domain: str): # DEBUG: print("DEBUG: Updating last_nodeinfo for domain:", domain) @@ -533,7 +553,7 @@ def get_peers(domain: str, software: str) -> list: # DEBUG: print(f"DEBUG: Adding peer: '{row['host']}'") peers.append(row["host"]) - #print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") update_last_instance_fetch(domain) # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) @@ -562,7 +582,7 @@ def get_peers(domain: str, software: str) -> list: except BaseException as e: print(f"WARNING: Exception during fetching JSON: domain='{domain}',exception[{type(e)}]:'{str(e)}'") - #print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") update_last_instance_fetch(domain) # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) @@ -601,7 +621,7 @@ def get_peers(domain: str, software: str) -> list: except BaseException as e: print(f"WARNING: Exception during fetching JSON: domain='{domain}',exception[{type(e)}]:'{str(e)}'") - #print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") update_last_instance_fetch(domain) # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) @@ -623,7 +643,7 @@ def get_peers(domain: str, software: str) -> list: print("WARNING: Could not reach any JSON API:", domain) update_last_error(domain, res) elif res.ok and isinstance(data, list): - print(f"DEBUG: domain='{domain}' returned a list: '{data}'") + # DEBUG: print(f"DEBUG: domain='{domain}' returned a list: '{data}'") sys.exit(255) elif "federated_instances" in data: # DEBUG: print(f"DEBUG: 
Found federated_instances for domain='{domain}'") @@ -640,7 +660,7 @@ def get_peers(domain: str, software: str) -> list: print("WARNING: Some error during get():", domain, e) update_last_error(domain, e) - #print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") update_last_instance_fetch(domain) # DEBUG: print("DEBUG: Returning peers[]:", type(peers)) @@ -668,7 +688,7 @@ def fetch_nodeinfo(domain: str, path: str = None) -> list: # DEBUG: print("DEBUG: Fetching nodeinfo from domain,path:", domain, path) nodeinfo = fetch_wellknown_nodeinfo(domain) - # DEBUG: print("DEBUG: nodeinfo:", len(nodeinfo)) + # DEBUG: print("DEBUG: nodeinfo:", nodeinfo) if len(nodeinfo) > 0: # DEBUG: print("DEBUG: Returning auto-discovered nodeinfo:", len(nodeinfo)) @@ -686,7 +706,7 @@ def fetch_nodeinfo(domain: str, path: str = None) -> list: data = {} for request in requests: if path != None and path != "" and request != path: - print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!") + # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!") continue try: @@ -770,15 +790,22 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: doc = bs4.BeautifulSoup(res.text, "html.parser") # DEBUG: print("DEBUG: doc[]:", type(doc)) - tag = doc.find("meta", {"name": "generator"}) + generator = doc.find("meta", {"name": "generator"}) + site_name = doc.find("meta", {"property": "og:site_name"}) - # DEBUG: print(f"DEBUG: tag[{type(tag)}: {tag}") - if isinstance(tag, bs4.element.Tag): - # DEBUG: print("DEBUG: Found generator meta tag: ", domain) - software = tidyup(tag.get("content")) + # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'") + if isinstance(generator, bs4.element.Tag): + # DEBUG: print("DEBUG: Found generator meta tag:", domain) + software = tidyup(generator.get("content")) print(f"INFO: domain='{domain}' is 
generated by '{software}'") nodeinfos["detection_mode"][domain] = "GENERATOR" remove_pending_error(domain) + elif isinstance(site_name, bs4.element.Tag): + # DEBUG: print("DEBUG: Found property=og:site_name:", domain) + sofware = tidyup(site_name.get("content")) + print(f"INFO: domain='{domain}' has og:site_name='{software}'") + nodeinfos["detection_mode"][domain] = "SITE_NAME" + remove_pending_error(domain) except BaseException as e: # DEBUG: print(f"DEBUG: Cannot fetch / from '{domain}':", e) @@ -794,9 +821,12 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: software = remove_version(software) # DEBUG: print(f"DEBUG: software[]={type(software)}") - if type(software) is str and "powered by" in software: + if type(software) is str and " powered by " in software: # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") software = remove_version(strip_powered_by(software)) + elif type(software) is str and " hosted on " in software: + # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it") + software = remove_version(strip_hosted_on(software)) elif type(software) is str and " by " in software: # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it") software = strip_until(software, " by ") @@ -824,8 +854,12 @@ def determine_software(domain: str, path: str = None) -> str: print("WARNING: JSON response is an error:", data["message"]) update_last_error(domain, data["message"]) return fetch_generator_from_path(domain) + elif "message" in data: + print("WARNING: JSON response contains only a message:", data["message"]) + update_last_error(domain, data["message"]) + return fetch_generator_from_path(domain) elif "software" not in data or "name" not in data["software"]: - # DEBUG: print(f"DEBUG: JSON response from {domain} does not include [software][name], fetching / ...") + # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...") software = 
fetch_generator_from_path(domain) # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!") @@ -956,9 +990,9 @@ def block_instance(blocker: str, blocked: str, reason: str, block_level: str): # DEBUG: print("DEBUG: EXIT!") def is_instance_registered(domain: str) -> bool: - # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!") + # NOISY-DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!") if not is_cache_initialized("is_registered"): - # DEBUG: print(f"DEBUG: Cache for 'is_registered' not initialized, fetching all rows ...") + # NOISY-DEBUG: print(f"DEBUG: Cache for 'is_registered' not initialized, fetching all rows ...") try: cursor.execute("SELECT domain FROM instances") @@ -971,7 +1005,7 @@ def is_instance_registered(domain: str) -> bool: # Is cache found? registered = is_cache_key_set("is_registered", domain) - # DEBUG: print(f"DEBUG: registered='{registered}' - EXIT!") + # NOISY-DEBUG: print(f"DEBUG: registered='{registered}' - EXIT!") return registered def add_instance(domain: str, origin: str, originator: str, path: str = None): @@ -986,7 +1020,7 @@ def add_instance(domain: str, origin: str, originator: str, path: str = None): software = determine_software(domain, path) # DEBUG: print("DEBUG: Determined software:", software) - print(f"INFO: Adding instance {domain} (origin: {origin})") + print(f"INFO: Adding instance domain='{domain}' (origin='{origin}',software='{software}')") try: cursor.execute( "INSERT INTO instances (domain, origin, originator, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)", @@ -1239,7 +1273,7 @@ def get_misskey_blocks(domain: str) -> dict: offset = 0 break - #print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") + # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...") update_last_instance_fetch(domain) # DEBUG: print("DEBUG: Returning for domain,blocked(),suspended():", domain, len(blocks["blocked"]), len(blocks["suspended"])) -- 2.39.5