1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
23 from urllib.parse import urlparse
29 from fba import blacklist
30 from fba import config
31 from fba import instances
32 from fba import network
34 from fba.federation import lemmy
35 from fba.federation import misskey
36 from fba.federation import peertube
# Pending errors, keyed by domain, that still need to be written to the database.
# remove_pending_error() deletes entries from this mapping.
pending_errors = {}
# "rel" identifiers (no real URLs) accepted in the "links" section of a
# /.well-known/nodeinfo document; both the https and http spellings are listed.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
# Module-wide SQLite handle, opened as a side effect of importing this module.
# "blocks.db" is a relative file name, so it is resolved against the current
# working directory of the process.
connection = sqlite3.connect("blocks.db")
# Shared cursor; log_error() issues its INSERT/DELETE statements through it.
cursor = connection.cursor()
# Compiled patterns for version numbers; remove_version() tries them in order
# and accepts the first one that matches.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev]
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0 (seven lower-case hex digits)
    re.compile(r"^[a-f0-9]{7}$"),
]
70 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Tell whether ``var`` is None or exactly an int, str, float or bool.

    The check uses the exact type, so instances of subclasses of these
    types are not considered primitive.
    """
    # DEBUG: print(f"DEBUG: var[]='{type(var)}' - CALLED!")
    return var is None or type(var) in {int, str, float, bool}
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register ``domain`` (if unknown) and every usable peer it reports.

    Args:
        domain: Instance whose peers are fetched.
        origin: Domain that led to ``domain``; may be None.
        software: Software name of ``domain``; when None it is determined
            through determine_software().
        script: Name passed on to instances.add() for newly added rows.
        path: Optional path handed to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type, or ``domain`` or
            ``script`` is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    # Done after validation so that 'script' is checked even when the
    # software still has to be looked up.
    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup_domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup_domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        try:
            if not instances.is_registered(instance):
                # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
                instances.add(instance, domain, script)
        except Exception as exception:
            # Best effort: one failing peer must not abort the whole run,
            # but KeyboardInterrupt/SystemExit are allowed to propagate.
            print(f"ERROR: instance='{instance}',exception[{type(exception)}]:'{str(exception)}'")
            continue

    # DEBUG: print("DEBUG: EXIT!")
def add_peers(rows: dict) -> list:
    """Collect the tidied, non-blacklisted peers from a peer mapping.

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked", each holding an iterable of peers or None.
            None for ``rows`` itself is treated as "no peers".

    Returns:
        List of peers after tidyup_domain(), without blacklisted entries.
    """
    # DEBUG: print(f"DEBUG: rows[]={type(rows)} - CALLED!")
    peers = []
    if rows is None:
        # Nothing was reported at all, so there is nothing to add
        return peers

    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if rows.get(key) is None:
            continue

        # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
        for peer in rows[key]:
            # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
            peer = tidyup_domain(peer)

            # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
            if blacklist.is_blacklisted(peer):
                # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                continue

            # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
            peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers
def remove_version(software: str) -> str:
    """Strip a trailing version number from a software string.

    Anything after the first ";", "," or " - " is dropped first. The last
    token of the remainder (split on space, "/" or "-") is taken as the
    version candidate and is only removed when one of the module-level
    ``patterns`` matches it.

    Returns:
        The software name without version, or ``software`` unchanged when
        no version could be identified.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    temp = software
    if ";" in software:
        temp = software.split(";")[0]
    elif "," in software:
        temp = software.split(",")[0]
    elif " - " in software:
        temp = software.split(" - ")[0]

    # Cutting at ";" or "," may leave a trailing space behind
    temp = temp.strip()

    # DEBUG: print(f"DEBUG: software='{software}',temp='{temp}'")
    # The separator is looked up in 'temp' because 'temp' is what gets split
    if " " in temp:
        version = temp.split(" ")[-1]
    elif "/" in temp:
        version = temp.split("/")[-1]
    elif "-" in temp:
        version = temp.split("-")[-1]
    else:
        # DEBUG: print(f"DEBUG: Was not able to find common seperator, returning untouched software='{software}'")
        return software

    # DEBUG: print(f"DEBUG: Checking {len(patterns)} patterns ...")
    if not any(pattern.match(version) for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    # DEBUG: print(f"DEBUG: Found valid version number: '{version}', removing it ...")
    # -1 also removes the separator character in front of the version
    end = len(temp) - len(version) - 1

    # DEBUG: print(f"DEBUG: end[{type(end)}]={end}")
    software = temp[0:end].strip()
    if " version" in software:
        # DEBUG: print(f"DEBUG: software='{software}' contains word ' version'")
        software = strip_until(software, " version")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_powered_by(software: str) -> str:
    """Return what follows "powered by " in ``software``.

    Everything up to and including the marker is removed; when the
    remainder contains " - " it is additionally cut there through
    strip_until(). Without the marker ``software`` is returned unchanged
    after printing a warning.

    Raises:
        ValueError: If ``software`` is not a str or is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    # One marker for both the check and the search, so find() cannot
    # return -1 after the check has passed.
    marker = "powered by "
    if not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif software == "":
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'powered by' in software='{software}'!")
        return software

    start = software.find(marker)
    # DEBUG: print(f"DEBUG: start[{type(start)}]='{start}'")

    software = software[start + len(marker):].strip()
    # DEBUG: print(f"DEBUG: software='{software}'")

    if " - " in software:
        software = strip_until(software, " - ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_hosted_on(software: str) -> str:
    """Return what precedes "hosted on " in ``software``.

    The marker and everything after it are removed; when the remainder
    contains " - " it is additionally cut there through strip_until().
    Without the marker ``software`` is returned unchanged after printing
    a warning.

    Raises:
        ValueError: If ``software`` is not a str or is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    # One marker for both the check and the search, so find() cannot
    # return -1 after the check has passed.
    marker = "hosted on "
    if not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif software == "":
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    end = software.find(marker)
    # DEBUG: print(f"DEBUG: end[{type(end)}]='{end}'")

    # Slice (not tuple index): keep everything in front of the marker
    software = software[0:end].strip()
    # DEBUG: print(f"DEBUG: software='{software}'")

    if " - " in software:
        software = strip_until(software, " - ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_until(software: str, until: str) -> str:
    """Cut ``software`` at the first occurrence of ``until``.

    Returns:
        The stripped text in front of ``until``; ``software`` unchanged
        (after printing a warning) when ``until`` does not occur.

    Raises:
        ValueError: If either parameter is not a str or is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}',until='{until}' - CALLED!")
    if not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif software == "":
        raise ValueError("Parameter 'software' is empty")
    elif not isinstance(until, str):
        raise ValueError(f"Parameter until[]='{type(until)}' is not 'str'")
    elif until == "":
        raise ValueError("Parameter 'until' is empty")
    elif until not in software:
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    # Next, strip until part
    end = software.find(until)

    # DEBUG: print(f"DEBUG: end[{type(end)}]='{end}'")
    software = software[0:end].strip()

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def remove_pending_error(domain: str):
    """Drop the pending error recorded for ``domain``, if there is one.

    Raises:
        ValueError: If ``domain`` is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        # Prevent updating any pending errors, nodeinfo was found
        del pending_errors[domain]
    except KeyError:
        # No pending error for this domain, nothing to remove
        pass

    # DEBUG: print("DEBUG: EXIT!")
def get_hash(domain: str) -> str:
    """Return the hexadecimal SHA-256 digest of the UTF-8 encoded ``domain``.

    Raises:
        ValueError: If ``domain`` is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
def log_error(domain: str, response: requests.models.Response):
    """Write an error for ``domain`` to the ``error_log`` table.

    Nothing is written unless the configuration value "write_error_log"
    equals "true" (case-insensitive). Exceptions and plain strings are
    stored with error code 999; any other ``response`` is expected to
    offer ``status_code`` and ``reason``. Afterwards rows older than the
    configured "error_log_cleanup" distance are deleted.

    Raises:
        ValueError: If ``domain`` is not a str or is empty.
    """
    # DEBUG: print("DEBUG: domain,response[]:", domain, type(response))
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif config.get("write_error_log").lower() != "true":
        # DEBUG: print(f"DEBUG: Writing to error_log is disabled in configuruation file - EXIT!")
        return

    try:
        # DEBUG: print("DEBUG: BEFORE response[]:", type(response))
        # JSONDecodeError is an exception as well, so one check covers it
        if isinstance(response, BaseException):
            response = f"response[{type(response)}]='{str(response)}'"

        # DEBUG: print("DEBUG: AFTER response[]:", type(response))
        if isinstance(response, str):
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",[
                domain,
                response,
                time.time()
            ])
        else:
            # NOTE(review): 'reason' as the message column is an assumption;
            # confirm against the error_log schema and its other writers.
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",[
                domain,
                response.status_code,
                response.reason,
                time.time()
            ])

        # Cleanup old entries
        # DEBUG: print(f"DEBUG: Purging old records (distance: {config.get('error_log_cleanup')})")
        cursor.execute("DELETE FROM error_log WHERE created < ?", [time.time() - config.get("error_log_cleanup")])
    except Exception as exception:
        # NOTE(review): confirm whether a failed query should abort the run
        # instead of only being reported.
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")

    # DEBUG: print("DEBUG: EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of ``domain``.

    misskey, lemmy and peertube are delegated to their own modules. For
    everything else "/api/v1/instance/peers" is queried first and
    "/api/v3/site" (key "federated_instances") is the fallback. On the
    generic path the number of peers is stored as "total_peers" and the
    last instance fetch is updated.

    Raises:
        ValueError: If ``domain`` is not a non-empty str or ``software``
            is neither a str nor None.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    peers = []
    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    response = network.fetch_response(domain, "/api/v1/instance/peers", network.api_headers, timeout)
    # DEBUG: print(f"DEBUG: response[]='{type(response)}'")

    data = json_from_response(response)
    # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")

    if not response.ok or response.status_code >= 400:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        response = network.fetch_response(domain, "/api/v3/site", network.api_headers, timeout)

        data = json_from_response(response)
        # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
        if not response.ok or response.status_code >= 400:
            print("WARNING: Could not reach any JSON API:", domain)
            instances.update_last_error(domain, response)
        elif isinstance(data, dict) and "federated_instances" in data:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            # Also reached for a list or other non-dict payload, which
            # cannot carry 'federated_instances'
            print("WARNING: JSON response does not contain 'federated_instances':", domain)
            instances.update_last_error(domain, response)
    elif isinstance(data, list):
        # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
        # Only a JSON list is a peer list; anything else leaves 'peers' empty
        peers = data

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> list:
    """Fetch nodeinfo for ``domain``.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. If
    that yields nothing, a fixed list of well-known paths is requested
    until one returns a JSON object; "detection_mode" and "nodeinfo_url"
    are then recorded for the domain.

    Args:
        domain: Instance to query.
        path: Optional known nodeinfo location, either as a path or as
            "https://<domain><path>"; when given, only the matching
            static path is requested.

    Returns:
        The decoded JSON of the last request made (a dict on success), or
        the result of the auto-discovery when that was non-empty.

    Raises:
        ValueError: If ``domain`` is not a non-empty str or ``path`` is
            neither a str nor None.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
    if len(nodeinfo) > 0:
        # DEBUG: print("DEBUG: nodeinfo()={len(nodeinfo))} - EXIT!")
        return nodeinfo

    # NOTE(review): only the two ".json" entries are certain; the remaining
    # fallbacks were reconstructed and should be confirmed.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    data = {}
    for request in request_paths:
        # Compare the given path with the candidate itself; comparing it
        # with a string built from 'path' could never be equal.
        if path is not None and path != "" and path not in (request, f"https://{domain}{request}"):
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        response = network.fetch_response(domain, request, network.api_headers, timeout)

        data = json_from_response(response)
        # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
        if response.ok and isinstance(data, dict):
            # DEBUG: print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url" , domain, request)
            break
        elif response.ok and isinstance(data, list):
            # NOTE(review): the next candidate is tried here; confirm that
            # this is the wanted reaction to a list payload.
            print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
        elif not response.ok or response.status_code >= 400:
            print("WARNING: Failed fetching nodeinfo from domain:", domain)
            instances.update_last_error(domain, response)

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
def fetch_wellknown_nodeinfo(domain: str) -> list:
    """Discover nodeinfo through "/.well-known/nodeinfo" of ``domain``.

    Every entry of the document's "links" whose "rel" is listed in
    ``nodeinfo_identifier`` is fetched in turn; the first one returning a
    JSON object wins and "detection_mode" / "nodeinfo_url" are recorded.

    Returns:
        The decoded JSON of the last request made.
        NOTE(review): when no link yields nodeinfo this is the discovery
        document (or a failed link's payload) rather than an empty
        result; confirm that callers expect this.

    Raises:
        ValueError: If ``domain`` is not a str or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    response = network.fetch_response(domain, "/.well-known/nodeinfo", network.api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))

    data = json_from_response(response)
    # DEBUG: print("DEBUG: domain,response.ok,data[]:", domain, response.ok, type(data))
    if response.ok and isinstance(data, dict):
        nodeinfo = data
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        links = nodeinfo.get("links")
        if isinstance(links, list):
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(links))
            for link in links:
                # The document comes from a remote server, so skip entries
                # that lack the fields needed below
                if not isinstance(link, dict) or "rel" not in link or "href" not in link:
                    continue

                # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
                if link["rel"] in nodeinfo_identifier:
                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
                    response = fetch_url(link["href"], network.api_headers, (config.get("connection_timeout"), config.get("read_timeout")))

                    data = json_from_response(response)
                    # DEBUG: print("DEBUG: href,response.ok,response.status_code:", link["href"], response.ok, response.status_code)
                    if response.ok and isinstance(data, dict):
                        # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url" , domain, link["href"])
                        break
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML served at ``path``.

    The content of <meta name="generator"> is preferred and
    <meta property="og:site_name"> is the fallback. The value is tidied,
    a version number is removed and "powered by", "hosted on", " by "
    and " see " phrases are stripped.

    Returns:
        The software name, or None when nothing usable was found.

    Raises:
        ValueError: If ``domain`` or ``path`` is not a non-empty str.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name": "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        # A tag without 'content' attribute yields None, which
        # tidyup_domain() rejects, so such tags are treated as absent.
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup_domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
            remove_pending_error(domain)
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup_domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")
            remove_pending_error(domain)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = remove_version(strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = remove_version(strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software ``domain`` runs.

    Nodeinfo is consulted first; when it is missing, reports an error or
    carries no usable software name, the HTML generator lookup
    (fetch_generator_from_path()) is used instead. Known forks are mapped
    to their upstream name (pleroma, mastodon, misskey) and version
    numbers and "powered by"-style phrases are removed.

    Returns:
        The software name, or whatever fetch_generator_from_path()
        returned when the fallback was used (possibly None).

    Raises:
        ValueError: If ``domain`` is not a non-empty str or ``path`` is
            neither a str nor None.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print("DEBUG: data[]:", type(data))
    if not isinstance(data, dict) or len(data) == 0:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    # DEBUG: print("DEBUG: data():", len(data), data)
    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)

    # Remote JSON: 'software' must be an object with a string 'name',
    # otherwise the lookups below would fail
    software_info = data.get("software")
    if not isinstance(software_info, dict) or not isinstance(software_info.get("name"), str):
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup_domain(software_info["name"])

    # DEBUG: print("DEBUG: sofware after tidyup_domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup_domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = strip_powered_by(software)
    elif " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = strip_until(software, " by ")
    elif " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software == "":
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif "." in software or " " in software:
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    # The generator fallback above may have returned None
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = remove_version(strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
def tidyup_reason(reason: str) -> str:
    """Strip surrounding whitespace from ``reason`` and turn every "â" into a double quote.

    Raises:
        ValueError: If ``reason`` is not a str.
    """
    # DEBUG: print(f"DEBUG: reason='{reason}' - CALLED!")
    if not isinstance(reason, str):
        raise ValueError(f"Parameter reason[]={type(reason)} is not 'str'")

    # Strip string
    reason = reason.strip()

    # Replace â with " (a literal character, so no regex is needed)
    reason = reason.replace("â", "\"")

    # DEBUG: print(f"DEBUG: reason='{reason}' - EXIT!")
    return reason
def tidyup_domain(domain: str) -> str:
    """Normalise a domain-like string.

    Lower-cases and strips the value, then removes a trailing dot, a
    trailing port number, a leading "http:"/"https:" (with or without
    slashes), one trailing slash, a user part ("user@" / leading "@") and
    anything from "/profile/" or "/users/" onwards.

    Raises:
        ValueError: If ``domain`` is not a str.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No port number
    domain = re.sub(r":\d+$", "", domain)

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:(/*)", "", domain)

    # No trailing slash
    domain = re.sub(r"/$", "", domain)

    # No leading @ sign
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r"(.+)@", "", domain)

    # str.find() returns -1 (which is true) when nothing is found, so
    # membership tests are used to decide whether to cut.
    if "/profile/" in domain:
        domain = domain.split("/profile/")[0]
    elif "/users/" in domain:
        domain = domain.split("/users/")[0]

    # DEBUG: print(f"DEBUG: domain='{domain}' - EXIT!")
    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of ``response``.

    Returns:
        The decoded JSON value, or an empty list when the body is blank
        or is not valid JSON.

    Raises:
        ValueError: If ``response`` is not a requests Response.
    """
    # DEBUG: print(f"DEBUG: response[]={type(response)} - CALLED!")
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    data = []
    if response.text.strip() != "":
        # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...")
        try:
            data = response.json()
        except ValueError:
            # The JSON decode errors of json, simplejson and requests are
            # all ValueError subclasses; an undecodable body counts as empty.
            pass

    # DEBUG: print(f"DEBUG: data[]={type(data)} - EXIT!")
    return data
def has_key(lists: list, key: str, value: object) -> bool:
    """Tell whether a dict in ``lists`` maps ``key`` to ``value``.

    Rows are checked in order and the search stops at the first match, so
    rows after a match are not validated.

    Raises:
        ValueError: If ``lists`` is not a list, ``key`` is not a
            non-empty str, or a visited row is not a dict.
        KeyError: If a visited row lacks ``key``.
    """
    # DEBUG: print(f"DEBUG: lists()={len(lists)},key='{key}',value[]='{type(value)}' - CALLED!")
    if not isinstance(lists, list):
        raise ValueError(f"Parameter lists[]='{type(lists)}' is not 'list'")
    elif not isinstance(key, str):
        raise ValueError(f"Parameter key[]='{type(key)}' is not 'str'")
    elif key == "":
        raise ValueError("Parameter 'key' is empty")

    # DEBUG: print(f"DEBUG: Checking lists()={len(lists)} ...")
    for row in lists:
        # DEBUG: print(f"DEBUG: row['{type(row)}']={row}")
        if not isinstance(row, dict):
            raise ValueError(f"row[]='{type(row)}' is not 'dict'")
        elif key not in row:
            raise KeyError(f"Cannot find key='{key}'")
        elif row[key] == value:
            # DEBUG: print("DEBUG: has=True - EXIT!")
            return True

    # DEBUG: print("DEBUG: has=False - EXIT!")
    return False
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    For every <tr> with at least one <td>, the first cell is the domain
    and the second cell the reason. Blacklisted and invalid domains are
    skipped.

    Returns:
        List of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: If ``tag`` is not a bs4 Tag.
        KeyError: If ``tag`` contains no table rows.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    table_rows = tag.select("tr")
    if len(table_rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = []
    for element in table_rows:
        # DEBUG: print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup_domain(cells[0].text)
        # A row with a single cell has no reason column
        reason = tidyup_reason(cells[1].text) if len(cells) > 1 else ""

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print("DEBUG: Multiple domains detected in one row")
            # NOTE(review): "gab.com" and "gab.ai" are derived from the
            # combined cell text above; "develop.gab.com" is certain.
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Response:
    """Fetch ``url`` through network.fetch_response().

    The URL is split into host name and path; the query string is only
    appended when present, so no trailing "?" is sent.

    Raises:
        ValueError: If ``url`` is not a non-empty str, ``headers`` is not
            a dict or ``timeout`` is not a tuple.
    """
    # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
    if not isinstance(url, str):
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    elif url == "":
        raise ValueError("Parameter 'url' is empty")
    elif not isinstance(headers, dict):
        raise ValueError(f"Parameter headers[]='{type(headers)}' is not 'dict'")
    elif not isinstance(timeout, tuple):
        raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not 'tuple'")

    # DEBUG: print(f"DEBUG: Parsing url='{url}'")
    components = urlparse(url)

    # Invoke other function, avoid trailing ?
    # DEBUG: print(f"DEBUG: components[{type(components)}]={components}")
    request_path = components.path
    if components.query != "":
        request_path = f"{components.path}?{components.query}"

    response = network.fetch_response(components.hostname, request_path, headers, timeout)

    # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXIT!")
    return response