From 5d699e50168a56c86cd35c9962f51730053d9585 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Sat, 17 Jun 2023 11:47:19 +0200 Subject: [PATCH] Continued: - in the end, a missing "t" caused a lot of 'sofware' to be None (NULL) now --- fba/federation.py | 18 +++++++++++------- fba/helpers/tidyup.py | 6 ++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fba/federation.py b/fba/federation.py index eebbfd7..55a0b21 100644 --- a/fba/federation.py +++ b/fba/federation.py @@ -364,16 +364,20 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: site_name = doc.find("meta", {"property": "og:site_name"}) # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'") - if isinstance(generator, bs4.element.Tag): - # DEBUG: print("DEBUG: Found generator meta tag:", domain) + if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str): + print("DEBUG: Found generator meta tag:", domain) software = tidyup.domain(generator.get("content")) - print(f"INFO: domain='{domain}' is generated by '{software}'") - instances.set_detection_mode(domain, "GENERATOR") + # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") + if software is not None and software != "": + print(f"INFO: domain='{domain}' is generated by '{software}'") + instances.set_detection_mode(domain, "GENERATOR") elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str): # DEBUG: print("DEBUG: Found property=og:site_name:", domain) - sofware = tidyup.domain(site_name.get("content")) - print(f"INFO: domain='{domain}' has og:site_name='{software}'") - instances.set_detection_mode(domain, "SITE_NAME") + software = tidyup.domain(site_name.get("content")) + # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'") + if software is not None and software != "": + print(f"INFO: domain='{domain}' has og:site_name='{software}'") + instances.set_detection_mode(domain, "SITE_NAME") # DEBUG: print(f"DEBUG: software[]='{type(software)}'") if isinstance(software, str) and software == "": diff --git a/fba/helpers/tidyup.py b/fba/helpers/tidyup.py index 32f3e9e..cdbfa40 100644 --- a/fba/helpers/tidyup.py +++ b/fba/helpers/tidyup.py @@ -33,22 +33,28 @@ def domain(string: str) -> str: # All lower-case and strip spaces out + last dot string = string.lower().strip().rstrip(".") + # DEBUG: print(f"DEBUG: string='{string}' - #1") # No port number string = re.sub("\:\d+$", "", string) + # DEBUG: print(f"DEBUG: string='{string}' - #2") # No protocol, sometimes without the slashes string = re.sub("^https?\:(\/*)", "", string) + # DEBUG: print(f"DEBUG: string='{string}' - #3") # No trailing slash string = re.sub("\/$", "", string) + # DEBUG: print(f"DEBUG: string='{string}' - #4") # No @ or : sign string = re.sub("^\@", "", string) string = string.split(":")[0] + # DEBUG: print(f"DEBUG: string='{string}' - #4") # No individual users in block lists string = re.sub("(.+)\@", "", string) + # DEBUG: print(f"DEBUG: string='{string}' - #5") if string.find("/profile/"): string = string.split("/profile/")[0] elif string.find("/users/"): -- 2.39.5