From a7b7cb2553881fc6f5282a4da26e8d59fc4ad874 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Fri, 26 May 2023 06:31:49 +0200 Subject: [PATCH] Continued: - also try to remove 'powered by ' and other self-advertisement from software type --- fba.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/fba.py b/fba.py index 334da1a..08e489f 100644 --- a/fba.py +++ b/fba.py @@ -91,7 +91,11 @@ cursor = connection.cursor() # Pattern instance for version numbers patterns = [ + # semantic version number (with v|V prefix) re.compile("^(?Pv|V{0,1})(\.{0,1})(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)(\.(?P0|[1-9]\d*)(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"), + # non-semantic, e.g. 1.2.3.4 + re.compile("^(?Pv|V{0,1})(\.{0,1})(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)(\.(?P0|[1-9]\d*)(\.(?P0|[1-9]\d*))?)$"), + # non-semantic, e.g. 2023.05 re.compile("^(?P[1-9]{1}[0-9]{3})\.(?P[0-9]{2})$") ] @@ -117,6 +121,7 @@ def remove_version(software: str) -> str: for pattern in patterns: # Run match() match = pattern.match(version) + # NOISY-DEBUG: print(f"DEBUG: match[]={type(match)}") if type(match) is re.Match: break @@ -135,6 +140,27 @@ def remove_version(software: str) -> str: # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!") return software +def strip_powered_by(software: str) -> str: + # NOISY-DEBUG: print(f"DEBUG: software='{software}' - CALLED!") + if not "powered by" in software: + print(f"WARNING: Cannot find 'powered by' in '{software}'!") + return software + + start = software.find("powered by ") + # NOISY-DEBUG: print(f"DEBUG: start[{type(start)}]='{start}'") + + software = software[start + 11:].strip() + # NOISY-DEBUG: print(f"DEBUG: software='{software}'") + + # Next, strip off ' - ' part + end = software.find(" - ") + # NOISY-DEBUG: print(f"DEBUG: end[{type(end)}]='{end}'") + if end > 0: + software = 
software[0:end].strip() + + # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!") + return software + def is_blacklisted(domain: str) -> bool: blacklisted = False for peer in blacklist: @@ -491,16 +517,20 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str: # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}") if type(software) is str and software == "": - print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'") + # NOISY-DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'") software = None elif type(software) is str and "." in software: # NOISY-DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") software = remove_version(software) + # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}") + if type(software) is str and "powered by" in software: + # NOISY-DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") + software = remove_version(strip_powered_by(software)) + # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!") return software - def determine_software(domain: str) -> str: # NOISY-DEBUG: print("DEBUG: Determining software for domain:", domain) software = None @@ -543,6 +573,9 @@ def determine_software(domain: str) -> str: elif software.find("|") > 0: print("WARNING: Spliting of pipe:", software) software = tidyup(software.split("|")[0]); + elif "powered by" in software: + print(f"DEBUG: software='{software}' has 'powered by' in it") + software = strip_powered_by(software) # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}") if software == "": @@ -557,6 +590,11 @@ def determine_software(domain: str) -> str: # NOISY-DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...") software = remove_version(software) + # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}") + if type(software) is str and "powered by" in software: + # 
NOISY-DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it") + software = remove_version(strip_powered_by(software)) + # NOISY-DEBUG: print("DEBUG: Returning domain,software:", domain, software) return software -- 2.39.5