]> git.mxchange.org Git - fba.git/commitdiff
Continued:
author Roland Häder <roland@mxchange.org>
Fri, 26 May 2023 04:31:49 +0000 (06:31 +0200)
committer Roland Häder <roland@mxchange.org>
Fri, 26 May 2023 04:31:49 +0000 (06:31 +0200)
- also try to remove 'powered by ' and other self-advertisement from software
  type

fba.py

diff --git a/fba.py b/fba.py
index 334da1acf03c1c38fdaba7dc39383657a611ab5b..08e489fc521e18edd13c7034ebfa4d1808db0cdc 100644 (file)
--- a/fba.py
+++ b/fba.py
@@ -91,7 +91,11 @@ cursor = connection.cursor()
 
 # Pattern instance for version numbers
 patterns = [
+    # semantic version number (with v|V prefix)
     re.compile("^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)(\.(?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
+    # non-semantic, e.g. 1.2.3.4
+    re.compile("^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)(\.(?P<patch>0|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
+    # non-semantic, e.g. 2023.05
     re.compile("^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})$")
 ]
 
@@ -117,6 +121,7 @@ def remove_version(software: str) -> str:
     for pattern in patterns:
         # Run match()
         match = pattern.match(version)
+
         # NOISY-DEBUG: print(f"DEBUG: match[]={type(match)}")
         if type(match) is re.Match:
             break
@@ -135,6 +140,27 @@ def remove_version(software: str) -> str:
     # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
     return software
 
+def strip_powered_by(software: str) -> str:
+    # NOISY-DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
+    if not "powered by" in software:
+        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
+        return software
+
+    start = software.find("powered by ")
+    # NOISY-DEBUG: print(f"DEBUG: start[{type(start)}]='{start}'")
+
+    software = software[start + 11:].strip()
+    # NOISY-DEBUG: print(f"DEBUG: software='{software}'")
+
+    # Next, strip off the ' - ' part
+    end = software.find(" - ")
+    # NOISY-DEBUG: print(f"DEBUG: end[{type(end)}]='{end}'")
+    if end > 0:
+        software = software[0:end].strip()
+
+    # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
+    return software
+
 def is_blacklisted(domain: str) -> bool:
     blacklisted = False
     for peer in blacklist:
@@ -491,16 +517,20 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
 
     # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}")
     if type(software) is str and software == "":
-        print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
+        # NOISY-DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
         software = None
     elif type(software) is str and "." in software:
         # NOISY-DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
         software = remove_version(software)
 
+    # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}")
+    if type(software) is str and "powered by" in software:
+        # NOISY-DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
+        software = remove_version(strip_powered_by(software))
+
     # NOISY-DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
     return software
 
-
 def determine_software(domain: str) -> str:
     # NOISY-DEBUG: print("DEBUG: Determining software for domain:", domain)
     software = None
@@ -543,6 +573,9 @@ def determine_software(domain: str) -> str:
     elif software.find("|") > 0:
         print("WARNING: Spliting of pipe:", software)
         software = tidyup(software.split("|")[0]);
+    elif "powered by" in software:
+        print(f"DEBUG: software='{software}' has 'powered by' in it")
+        software = strip_powered_by(software)
 
     # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}")
     if software == "":
@@ -557,6 +590,11 @@ def determine_software(domain: str) -> str:
         # NOISY-DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
         software = remove_version(software)
 
+    # NOISY-DEBUG: print(f"DEBUG: software[]={type(software)}")
+    if type(software) is str and "powered by" in software:
+        # NOISY-DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
+        software = remove_version(strip_powered_by(software))
+
     # NOISY-DEBUG: print("DEBUG: Returning domain,software:", domain, software)
     return software