1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
26 from urllib.parse import urlparse
28 from fba import blacklist
30 from fba import config
31 from fba import instances
32 from fba import network
34 from fba.federation import lemmy
35 from fba.federation import misskey
36 from fba.federation import peertube
# Pending errors, keyed by domain, that still need to be written to the database
pending_errors = {}

# "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]

# HTTP headers for non-API requests
headers = {
    "User-Agent": config.get("useragent"),
}

# HTTP headers for API requests
api_headers = {
    "User-Agent": config.get("useragent"),
    "Content-Type": "application/json",
}

# Connect to database
connection = sqlite3.connect("blocks.db")
cursor = connection.cursor()

# Pattern instances for version numbers, tried in order by remove_version().
# Raw strings keep the regex escapes (\., \d, \+) out of Python's own
# string-escape processing; the compiled patterns are unchanged.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev]
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0
    re.compile(r"^[a-f0-9]{7}$"),
]
81 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Return True when *var* is None or exactly an int, str, float or bool.

    Subclasses of those types are not treated as primitive because the check
    compares the exact type. None is detected by identity, so objects with a
    custom ``__eq__`` cannot masquerade as None.
    """
    return var is None or type(var) in {int, str, float, bool}
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register *domain* if needed, fetch its peers and register every new peer.

    Parameters:
        domain:   instance to crawl; must be a non-empty str.
        origin:   domain that led to this one, or None.
        software: software name of *domain*; when None it is determined via
                  determine_software().
        script:   passed through to instances.add(); must be a non-empty str.
        path:     optional path handed to determine_software() and
                  instances.add().

    Raises:
        ValueError: if a parameter has the wrong type or is empty.
    """
    # Validate everything up front so that a missing software name cannot
    # short-circuit the checks on 'script'.
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif type(origin) != str and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif type(software) != str and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif type(script) != str:
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    if software is None:
        print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        instances.add(domain, origin, script, path)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data collected while fetching peers
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        try:
            # Skip "None" entries as tidyup_domain() cannot parse them
            if instance is None:
                continue

            instance = tidyup_domain(instance)

            if instance == "":
                print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
                continue
            elif not validators.domain(instance.split("/")[0]):
                print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
                continue
            elif blacklist.is_blacklisted(instance):
                continue

            if not instances.is_registered(instance):
                instances.add(instance, domain, script)
        except Exception as e:
            # One bad peer must not stop the loop; KeyboardInterrupt and
            # SystemExit are deliberately not caught here.
            print(f"ERROR: instance='{instance}',exception[{type(e)}]:'{str(e)}'")
            continue
def add_peers(rows: dict) -> list:
    """Collect the tidied, non-blacklisted peers from *rows*.

    Only the "linked", "allowed" and "blocked" entries of *rows* are read;
    missing or None entries are skipped.
    """
    collected = []
    for section in ("linked", "allowed", "blocked"):
        if section not in rows or rows[section] is None:
            continue

        for entry in rows[section]:
            cleaned = tidyup_domain(entry)
            if not blacklist.is_blacklisted(cleaned):
                collected.append(cleaned)

    return collected
def remove_version(software: str) -> str:
    """Strip a trailing version number from *software*.

    The string is returned untouched when it has neither a dot nor a space,
    when no known separator is present, or when the candidate version does
    not match any entry of the module-level ``patterns`` list.
    """
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    # Drop everything after the first ";", "," or " - " (first one that applies)
    head = software
    for delimiter in (";", ",", " - "):
        if delimiter in software:
            head = software.split(delimiter)[0]
            break

    # The version candidate is the last piece after the first separator found
    for separator in (" ", "/", "-"):
        if separator in software:
            version = head.split(separator)[-1]
            break
    else:
        return software

    if not any(type(pattern.match(version)) is re.Match for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    # Cut off the version plus the one separator character in front of it
    cut = len(head) - len(version) - 1
    software = head[0:cut].strip()
    if " version" in software:
        software = strip_until(software, " version")

    return software
def strip_powered_by(software: str) -> str:
    """Return the text following "powered by" in *software*.

    The same marker is used for the membership test and for locating the cut
    position, so a marker at the very end of the string yields "" instead of
    a slice taken at a bogus offset. Anything from " - " onwards is removed
    via strip_until().

    Raises:
        Exception: if *software* is empty.
    """
    marker = "powered by"
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise Exception("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
        return software

    start = software.find(marker)
    software = software[start + len(marker):].strip()

    # strip_until() rejects empty input, so only call it when text is left
    if software != "":
        software = strip_until(software, " - ")

    return software
def strip_hosted_on(software: str) -> str:
    """Return the text preceding "hosted on" in *software*.

    Anything from " - " onwards is removed afterwards via strip_until().

    Raises:
        Exception: if *software* is empty.
    """
    marker = "hosted on"
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise Exception("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    end = software.find(marker)

    # Slice (not tuple-index) up to the marker
    software = software[0:end].strip()

    # strip_until() rejects empty input, so only call it when text is left
    if software != "":
        software = strip_until(software, " - ")

    return software
def strip_until(software: str, until: str) -> str:
    """Return *software* cut off at the first occurrence of *until*, stripped.

    When *until* does not occur, *software* is returned unchanged and a
    warning is printed.

    Raises:
        ValueError: if either parameter is not a str or is empty. ValueError
            is an Exception subclass, so ``except Exception`` still catches it.
    """
    if not isinstance(software, str):
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(until, str):
        raise ValueError(f"Parameter until[]={type(until)} is not 'str'")
    elif software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif until == "":
        print("ERROR: Bad method call, 'until' is empty")
        raise ValueError("Parameter 'until' is empty")

    # partition() finds the marker and splits in a single pass
    head, separator, _ = software.partition(until)
    if separator == "":
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    return head.strip()
def remove_pending_error(domain: str):
    """Forget any pending error recorded for *domain*.

    Raises:
        ValueError: if *domain* is not a str or is empty.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError(f"Parameter 'domain' is empty")

    # Prevent updating any pending errors, nodeinfo was found.
    # A domain without a pending error is simply ignored.
    pending_errors.pop(domain, None)
def get_hash(domain: str) -> str:
    """Return the hex-encoded SHA-256 digest of the UTF-8 encoded *domain*.

    Raises:
        ValueError: if *domain* is not a str or is empty.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError(f"Parameter 'domain' is empty")

    digest = hashlib.sha256()
    digest.update(domain.encode("utf-8"))
    return digest.hexdigest()
def log_error(domain: str, response: requests.models.Response):
    """Write an error for *domain* to the error_log table and purge old rows.

    *response* may be an HTTP response object, an exception or a plain str.
    Exceptions are stored via str(); for those and for plain strings the
    error code 999 is recorded. Rows older than the "error_log_cleanup"
    config value (subtracted from the current time) are deleted afterwards.

    Raises:
        ValueError: if *domain* is not a str or is empty.

    A failing SQL statement is reported and terminates the process with
    exit status 255.
    """
    import sys

    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        # json.decoder.JSONDecodeError is itself an exception, so a single
        # check covers it as well.
        if isinstance(response, BaseException):
            response = str(response)

        now = time.time()
        if type(response) is str:
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)", [
                domain,
                response,
                now,
            ])
        else:
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)", [
                domain,
                response.status_code,
                response.reason,
                now,
            ])

        # Cleanup old entries
        cursor.execute("DELETE FROM error_log WHERE created < ?", [now - config.get("error_log_cleanup")])
    except Exception as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        sys.exit(255)
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of *domain*.

    For "misskey", "lemmy" and "peertube" the matching federation module is
    used. Otherwise "/api/v1/instance/peers" is queried, with "/api/v3/site"
    (key "federated_instances") as fallback. The number of peers and the time
    of the fetch are recorded through the ``instances`` module.

    Raises:
        ValueError: if *domain* is not a non-empty str, or *software* is
            neither a str nor None.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif type(software) != str and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))
    peers = []
    try:
        response = network.fetch_response(domain, "/api/v1/instance/peers", api_headers, timeout)
        data = json_from_response(response)

        if not response.ok or response.status_code >= 400:
            # Primary endpoint failed, try the alternative one
            response = network.fetch_response(domain, "/api/v3/site", api_headers, timeout)
            data = json_from_response(response)

            if not response.ok or response.status_code >= 400:
                print("WARNING: Could not reach any JSON API:", domain)
                instances.update_last_error(domain, response)
            elif isinstance(data, list):
                # A bare list is not a supported /api/v3/site payload:
                # record it and return no peers instead of aborting.
                print(f"WARNING: domain='{domain}' returned a list: '{data}'")
                instances.update_last_error(domain, response)
            elif "federated_instances" in data:
                peers = peers + add_peers(data["federated_instances"])
            else:
                print("WARNING: JSON response does not contain 'federated_instances':", domain)
                instances.update_last_error(domain, response)
        else:
            # NOTE(review): the payload is used as-is; presumably a list of
            # domain strings - confirm against the API of the remote software.
            peers = data
    except Exception as e:
        # KeyboardInterrupt/SystemExit are intentionally not swallowed here
        print("WARNING: Some error during get():", domain, e)
        instances.update_last_error(domain, e)

    instances.set("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> list:
    """Fetch nodeinfo for *domain*, preferring the .well-known discovery.

    When fetch_wellknown_nodeinfo() yields a non-empty result, it is returned
    directly. Otherwise a fixed list of static paths is requested in order
    until one answers with a JSON object. If *path* is given, only the
    request matching it (as a bare path or as https://<domain><path>) is
    attempted.

    Returns the decoded JSON of the last request made, or {} when none was.

    Raises:
        ValueError: if *domain* is not a non-empty str, or *path* is neither
            a str nor None.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif type(path) != str and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)
    if len(nodeinfo) > 0:
        return nodeinfo

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    data = {}
    for request in request_paths:
        # Compare against the candidate request; comparing 'path' with a URL
        # built from 'path' itself could never match.
        if path is not None and path != "" and path not in (request, f"https://{domain}{request}"):
            continue

        try:
            response = network.fetch_response(domain, request, api_headers, timeout)
            data = json_from_response(response)

            if response.ok and isinstance(data, dict):
                instances.set("detection_mode", domain, "STATIC_CHECK")
                instances.set("nodeinfo_url", domain, request)
                break
            elif response.ok and isinstance(data, list):
                # Not a usable nodeinfo document: record it and try the next path
                print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
                instances.update_last_error(domain, response)
                continue
            elif not response.ok or response.status_code >= 400:
                print("WARNING: Failed fetching nodeinfo from domain:", domain)
                instances.update_last_error(domain, response)
                continue
        except Exception as e:
            # KeyboardInterrupt/SystemExit are intentionally not swallowed here
            instances.update_last_error(domain, e)
            continue

    return data
def fetch_wellknown_nodeinfo(domain: str) -> list:
    """Discover and fetch nodeinfo via "/.well-known/nodeinfo" of *domain*.

    Each entry of the document's "links" whose "rel" is listed in
    ``nodeinfo_identifier`` is fetched until one returns a JSON object; that
    URL is then stored as "nodeinfo_url" with detection mode
    "AUTO_DISCOVERY".

    Returns the JSON decoded from the last response received ({} if none).
    NOTE(review): this may be the well-known document itself when no link
    could be used - confirm that callers expect that.

    Raises:
        ValueError: if *domain* is not a str or is empty.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    data = {}
    try:
        response = network.fetch_response(domain, "/.well-known/nodeinfo", api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))

        data = json_from_response(response)
        if response.ok and isinstance(data, dict):
            nodeinfo = data
            if "links" in nodeinfo:
                for link in nodeinfo["links"]:
                    # A link without "rel" is reported like an unknown one
                    # instead of aborting the whole discovery with KeyError.
                    rel = link.get("rel")
                    if rel in nodeinfo_identifier:
                        response = fetch_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))

                        data = json_from_response(response)
                        if response.ok and isinstance(data, dict):
                            instances.set("detection_mode", domain, "AUTO_DISCOVERY")
                            instances.set("nodeinfo_url", domain, link["href"])
                            break
                    else:
                        print("WARNING: Unknown 'rel' value:", domain, rel)
            else:
                print("WARNING: nodeinfo does not contain 'links':", domain)
    except Exception as e:
        # KeyboardInterrupt/SystemExit are intentionally not swallowed here
        print("WARNING: Failed fetching .well-known info:", domain)
        instances.update_last_error(domain, e)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of *domain* from the HTML fetched at *path*.

    The <meta name="generator"> tag is preferred; <meta property=
    "og:site_name"> is the fallback. The detection mode ("GENERATOR" or
    "SITE_NAME") is stored and any pending error for the domain is removed.
    The found value then has version numbers and "powered by" / "hosted on" /
    " by " / " see " phrases stripped.

    Returns the software name, or None when nothing usable was found.

    Raises:
        ValueError: if *domain* or *path* is not a non-empty str.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif type(path) != str:
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None
    try:
        response = network.fetch_response(domain, path, headers, (config.get("connection_timeout"), config.get("read_timeout")))

        if response.ok and response.status_code < 300 and len(response.text) > 0:
            doc = bs4.BeautifulSoup(response.text, "html.parser")

            generator = doc.find("meta", {"name": "generator"})
            site_name = doc.find("meta", {"property": "og:site_name"})

            if isinstance(generator, bs4.element.Tag):
                software = tidyup_domain(generator.get("content"))
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set("detection_mode", domain, "GENERATOR")
                remove_pending_error(domain)
            elif isinstance(site_name, bs4.element.Tag):
                # Assign to 'software' so the site name is actually returned
                software = tidyup_domain(site_name.get("content"))
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set("detection_mode", domain, "SITE_NAME")
                remove_pending_error(domain)
    except Exception as e:
        # KeyboardInterrupt/SystemExit are intentionally not swallowed here
        instances.update_last_error(domain, e)

    if type(software) is str and software == "":
        # An empty name counts as "not detected"
        software = None
    elif type(software) is str and ("." in software or " " in software):
        software = remove_version(software)

    if type(software) is str and " powered by " in software:
        software = remove_version(strip_powered_by(software))
    elif type(software) is str and " hosted on " in software:
        software = remove_version(strip_hosted_on(software))
    elif type(software) is str and " by " in software:
        software = strip_until(software, " by ")
    elif type(software) is str and " see " in software:
        software = strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software *domain* runs.

    The nodeinfo document (see fetch_nodeinfo()) is consulted first. If it is
    missing, reports an error/message or lacks software.name, the generator
    meta tag is used instead (see fetch_generator_from_path()). Several forks
    are mapped onto their upstream name (pleroma, mastodon, misskey), and
    version numbers and "powered by" style phrases are stripped.

    Returns the software name, or None when it could not be determined.

    Raises:
        ValueError: if *domain* is not a non-empty str, or *path* is neither
            a str nor None.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif type(path) != str and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if not isinstance(data, dict) or len(data) == 0:
        return fetch_generator_from_path(domain)

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        return fetch_generator_from_path(domain)

    software = tidyup_domain(data["software"]["name"])

    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup_domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        software = strip_powered_by(software)
    elif " by " in software:
        software = strip_until(software, " by ")
    elif " see " in software:
        software = strip_until(software, " see ")

    if software == "":
        # Nothing usable left in nodeinfo, fall back to the generator tag
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        software = fetch_generator_from_path(domain)
    elif "." in software or " " in software:
        software = remove_version(software)

    # The fallback may return None, so check the type before searching
    if type(software) is str and "powered by" in software:
        software = remove_version(strip_powered_by(software))

    return software
def tidyup_reason(reason: str) -> str:
    """Return *reason* without surrounding whitespace and with "â" turned into '"'.

    Raises:
        ValueError: if *reason* is not a str.
    """
    if type(reason) != str:
        raise ValueError(f"Parameter reason[]={type(reason)} is not 'str'")

    # Strip first, then swap every "â" for a plain double quote
    return reason.strip().replace("â", "\"")
def tidyup_domain(domain: str) -> str:
    """Normalize *domain*: lower-case it and remove everything but the host.

    Removed are surrounding whitespace, a trailing dot, the http(s) protocol,
    a trailing slash, user parts ("@user@host", "user@host"),
    "/profile/..." and "/users/..." paths and finally a trailing port number.

    Raises:
        ValueError: if *domain* is not a str.
    """
    if type(domain) != str:
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:(/*)", "", domain)

    # No trailing slash
    domain = re.sub(r"/$", "", domain)

    # No leading @ sign
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r"(.+)@", "", domain)

    # str.find() returns -1 (truthy) on a miss, so test membership instead
    if "/profile/" in domain:
        domain = domain.split("/profile/")[0]
    elif "/users/" in domain:
        domain = domain.split("/users/")[0]

    # No port number; done last so a trailing slash or path cannot hide it
    domain = re.sub(r":\d+$", "", domain)

    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of *response*.

    Returns an empty list when the body is blank or is not valid JSON;
    otherwise whatever ``response.json()`` yields.

    Raises:
        ValueError: if *response* is not a requests Response object.
    """
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    data = []
    if response.text.strip() != "":
        try:
            data = response.json()
        except ValueError:
            # Both the stdlib and the simplejson decode errors derive from
            # ValueError, so this works whichever backend requests uses.
            pass

    return data
def has_key(keys: list, search: str, value: any) -> bool:
    """Tell whether any dict in *keys* maps *search* to *value*.

    Entries are examined in order and the scan stops at the first match.

    Raises:
        ValueError: if *keys* is not a list, *search* is not a non-empty str,
            or an examined entry is not a dict.
        KeyError: if an examined entry does not contain *search*.
    """
    if type(keys) != list:
        raise ValueError(f"Parameter keys[]='{type(keys)}' is not 'list'")
    if type(search) != str:
        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
    if search == "":
        raise ValueError("Parameter 'search' is empty")

    for entry in keys:
        if type(entry) != dict:
            raise ValueError(f"key[]='{type(entry)}' is not 'dict'")
        if search not in entry:
            raise KeyError(f"Cannot find search='{search}'")
        if entry[search] == value:
            return True

    return False
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from the table rows in *tag*.

    For every <tr> containing <td> cells, the first cell is taken as the
    domain and the second as the reason. Blacklisted and invalid domains are
    skipped; the combined "gab.com/.ai, develop.gab.com" row is expanded into
    three entries.

    Returns a list of {"domain": ..., "reason": ...} dicts.

    Raises:
        ValueError: if *tag* is not a bs4 Tag.
        KeyError: if *tag* contains no table rows.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Select the rows once and reuse them for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = []
    for row in rows:
        cells = row.find_all("td")
        if len(cells) == 0:
            # Header or otherwise cell-less row
            continue

        domain = tidyup_domain(cells[0].text)
        reason = tidyup_reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains in one row, all sharing the same reason
            for name in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": name,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def fetch_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
    """Fetch *url* by handing its host and path to network.fetch_response().

    The query string is appended to the path only when there is one, which
    avoids a trailing "?".

    Raises:
        ValueError: if *url* is not a str or is empty.
    """
    if type(url) != str:
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    if url == "":
        raise ValueError("Parameter 'url' is empty")

    parts = urlparse(url)

    # Only add "?query" when a query is present
    target = parts.path if parts.query == "" else f"{parts.path}?{parts.query}"

    return network.fetch_response(parts.hostname, target, headers, timeout)