1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
28 from urllib.parse import urlparse
30 from fba import blacklist
32 from fba import config
33 from fba import instances
35 from fba.federation import lemmy
36 from fba.federation import misskey
37 from fba.federation import peertube
# Pending errors that still need to be written to the database, keyed by domain
pending_errors = {}

# "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]

# HTTP headers for non-API requests
headers = {
    "User-Agent": config.get("useragent"),
}

# HTTP headers for API requests
api_headers = {
    "User-Agent": config.get("useragent"),
    "Content-Type": "application/json",
}

# URL for fetching peers
get_peers_url = "/api/v1/instance/peers"

# Database connection and cursor shared by the functions of this module
connection = sqlite3.connect("blocks.db")
cursor = connection.cursor()

# Pattern instances for version numbers. Raw strings are required here:
# "\." and "\d" are invalid escape sequences in ordinary string literals.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev] (year and month are separated by a dot)
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0 (7 lower-case hex digits)
    re.compile(r"^[a-f0-9]{7}$"),
]
85 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Return True if ``var`` is None or exactly an int, str, float or bool.

    The exact type is compared on purpose, so instances of subclasses of
    these types are not considered primitive.

    Args:
        var: Any value.

    Returns:
        True for None and for values whose type is int, str, float or bool.
    """
    # Identity check: "== None" would call a custom __eq__ and may lie or fail
    return var is None or type(var) in (int, str, float, bool)
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register ``domain`` if needed, then register every valid peer it reports.

    Args:
        domain: Domain whose peers are fetched; must be a non-empty string.
        origin: Domain that led to ``domain``, or None.
        software: Software name handed to get_peers(); validated there.
        script: Name of the invoking script; passed to add_instance() as the
            originator of new rows. Must be a non-empty string.
        path: Optional nodeinfo path forwarded to add_instance() for ``domain``.

    Raises:
        ValueError: If an argument has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        # The original reported 'domain' here although it follows the script check
        raise ValueError("Parameter 'script' is empty")

    if not is_instance_registered(domain):
        add_instance(domain, origin, script, path)

    peerlist = get_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data that was collected while fetching the peers
        instances.update_instance_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for peer in peerlist:
        if peer is None:
            # Skip "None" entries as tidyup_domain() cannot parse them
            continue

        peer = tidyup_domain(peer)

        if peer == "":
            print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
            continue
        elif not validators.domain(peer.split("/")[0]):
            print(f"WARNING: Bad instance='{peer}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(peer):
            continue

        try:
            if not is_instance_registered(peer):
                add_instance(peer, domain, script)
        except Exception as e:
            # Best effort: one bad peer must not abort the whole run, while
            # KeyboardInterrupt/SystemExit are still allowed to propagate.
            print(f"ERROR: instance='{peer}',exception[{type(e)}]:'{str(e)}'")
def add_peers(rows: dict) -> list:
    """Collect all non-blacklisted peers from a ``federated_instances`` mapping.

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked", each holding an iterable of peer names or None.

    Returns:
        The peers from all three keys, each passed through tidyup_domain(),
        without those that blacklist.is_blacklisted() reports.
    """
    peers = []
    for key in ("linked", "allowed", "blocked"):
        entries = rows.get(key)
        if entries is None:
            # Key is missing or explicitly null
            continue

        for peer in entries:
            peer = tidyup_domain(peer)
            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers
def remove_version(software: str) -> str:
    """Strip a trailing version number from a software name.

    The text before the first ";", "," or " - " is examined. Its last token,
    separated by a space, "/" or "-", is treated as the version candidate and
    removed only when it matches one of the module-level ``patterns``.

    Args:
        software: Software name that may end with a version number.

    Returns:
        The name without version, or ``software`` unchanged when no version
        could be identified.
    """
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    # Cut off trailing remarks first
    temp = software
    for delimiter in (";", ",", " - "):
        if delimiter in software:
            temp = software.split(delimiter)[0]
            break

    # Look for the separator in the text that is actually split; testing the
    # uncut string could select a separator that only occurs in the part that
    # was just cut off.
    for separator in (" ", "/", "-"):
        if separator in temp:
            version = temp.split(separator)[-1]
            break
    else:
        # No common separator found, leave the name untouched
        return software

    if not any(pattern.match(version) for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    # Drop the version and the single separator character in front of it
    end = len(temp) - len(version) - 1
    software = temp[0:end].strip()

    if " version" in software:
        software = strip_until(software, " version")

    return software
def strip_powered_by(software: str) -> str:
    """Return the text that follows "powered by" in ``software``.

    Anything from the first " - " onwards is removed from the result via
    strip_until().

    Args:
        software: Non-empty text, e.g. taken from a generator meta tag.

    Returns:
        The stripped name, or ``software`` unchanged when it does not contain
        "powered by".

    Raises:
        ValueError: If ``software`` is empty.
    """
    marker = "powered by"

    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
        return software

    # Search for exactly the text that was checked above. The original looked
    # for "powered by " (trailing space) and added a fixed 11, so a marker at
    # the very end of the string gave find() == -1 and a wrong slice.
    start = software.find(marker)
    software = software[start + len(marker):].strip()

    # strip_until() rejects empty input, so only call it when text is left
    if software != "":
        software = strip_until(software, " - ")

    return software
def strip_hosted_on(software: str) -> str:
    """Return the text that precedes "hosted on" in ``software``.

    Anything from the first " - " onwards is removed from the result via
    strip_until().

    Args:
        software: Non-empty text, e.g. taken from a generator meta tag.

    Returns:
        The stripped name, or ``software`` unchanged when it does not contain
        "hosted on".

    Raises:
        ValueError: If ``software`` is empty.
    """
    marker = "hosted on"

    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    end = software.find(marker)

    # The original used software[0, start]: a tuple index on a str with an
    # undefined name, which always failed. A slice up to the marker is meant.
    software = software[0:end].strip()

    # strip_until() rejects empty input, so only call it when text is left
    if software != "":
        software = strip_until(software, " - ")

    return software
def strip_until(software: str, until: str) -> str:
    """Cut ``software`` at the first occurrence of ``until``.

    Args:
        software: Non-empty text to shorten.
        until: Non-empty marker; it and everything after it is removed.

    Returns:
        The whitespace-stripped text before ``until``. ``software`` is
        returned unchanged when the marker is missing or sits at position 0
        (cutting there would leave nothing behind).

    Raises:
        ValueError: If ``software`` or ``until`` is empty.
    """
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif until == "":
        print("ERROR: Bad method call, 'until' is empty")
        raise ValueError("Parameter 'until' is empty")
    elif until not in software:
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    # Next, strip until part
    end = software.find(until)
    if end > 0:
        software = software[0:end].strip()

    return software
def remove_pending_error(domain: str):
    """Forget a pending error for ``domain``, if one is recorded.

    Args:
        domain: Non-empty domain name used as key in ``pending_errors``.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Prevent updating any pending errors, nodeinfo was found. Callers invoke
    # this unconditionally, so a missing key must not raise.
    pending_errors.pop(domain, None)
def get_hash(domain: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``domain`` (UTF-8 encoded).

    Args:
        domain: Non-empty domain name.

    Returns:
        A 64 character lower-case hex string.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
def log_error(domain: str, response: requests.models.Response):
    """Write an error for ``domain`` to the ``error_log`` table and purge old rows.

    Args:
        domain: Non-empty domain name the error belongs to.
        response: A response object (its ``status_code`` and ``reason`` are
            stored), an exception, or a plain message string. Exceptions and
            strings are stored with the error code 999.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    # Function-scope import: only needed to abort on a failed query
    import sys

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        # json.decoder.JSONDecodeError is a subclass of BaseException, so a
        # single check covers both cases the original tested separately
        if isinstance(response, BaseException):
            response = str(response)

        now = time.time()
        if isinstance(response, str):
            cursor.execute(
                "INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",
                [domain, response, now],
            )
        else:
            cursor.execute(
                "INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",
                [domain, response.status_code, response.reason, now],
            )

        # Cleanup old entries
        cursor.execute("DELETE FROM error_log WHERE created < ?", [now - config.get("error_log_cleanup")])
    except Exception as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        # NOTE(review): the statement following this message is not visible in
        # the reviewed excerpt; aborting with exit code 255 is assumed - confirm.
        sys.exit(255)
def update_last_error(domain: str, response: requests.models.Response):
    """Record the last error of ``domain`` and write it to the error log.

    Args:
        domain: Non-empty domain name.
        response: A response object (its ``status_code`` and ``reason`` are
            stored), an exception, or a plain message string. Exceptions and
            strings are stored with the status code 999.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    if isinstance(response, BaseException):
        # The original f-string lacked the call and the braces, so it always
        # produced the literal text "<class 'type'>:str(response)".
        response = f"{type(response)}:{str(response)}"

    if isinstance(response, str):
        instances.set("last_status_code", domain, 999)
        instances.set("last_error_details", domain, response)
    else:
        instances.set("last_status_code", domain, response.status_code)
        instances.set("last_error_details", domain, response.reason)

    # Run pending updates
    instances.update_instance_data(domain)

    log_error(domain, response)
def update_last_nodeinfo(domain: str):
    """Stamp ``last_nodeinfo`` and ``last_updated`` of ``domain`` with the current time.

    Args:
        domain: Non-empty domain name.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # One timestamp for both columns, so that they cannot differ
    now = time.time()
    instances.set("last_nodeinfo", domain, now)
    instances.set("last_updated", domain, now)

    # Run pending updates
    instances.update_instance_data(domain)
def get_peers(domain: str, software: str) -> list:
    """Fetch the list of peers known to ``domain``.

    misskey, lemmy and peertube are delegated to their own modules. Everything
    else is asked via ``get_peers_url`` first and via "/api/v3/site" (key
    "federated_instances") as fallback.

    Args:
        domain: Non-empty domain name to query.
        software: Software name of ``domain``, or None if unknown.

    Returns:
        The peers for the generic path (an empty list on failure), otherwise
        whatever the delegated module returns.

    Raises:
        ValueError: If an argument has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        print(f"DEBUG: Invoking misskey.get_peers({domain}) ...")
        return misskey.get_peers(domain)
    elif software == "lemmy":
        print(f"DEBUG: Invoking lemmy.get_peers({domain}) ...")
        return lemmy.get_peers(domain)
    elif software == "peertube":
        print(f"DEBUG: Invoking peertube.get_peers({domain}) ...")
        return peertube.get_peers(domain)

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))
    peers = []

    try:
        response = get_response(domain, get_peers_url, api_headers, timeout)
        data = json_from_response(response)

        if not response.ok or response.status_code >= 400:
            # First API is not available, try the alternative one
            response = get_response(domain, "/api/v3/site", api_headers, timeout)
            data = json_from_response(response)

            if not response.ok or response.status_code >= 400:
                print("WARNING: Could not reach any JSON API:", domain)
                update_last_error(domain, response)
            elif isinstance(data, list):
                # NOTE(review): the original statement for this branch is not
                # visible in the reviewed excerpt. A list cannot contain the
                # "federated_instances" key, so it is reported and ignored.
                print(f"WARNING: domain='{domain}' returned an unsupported list from '/api/v3/site'")
            elif isinstance(data, dict) and "federated_instances" in data:
                peers = add_peers(data["federated_instances"])
            else:
                print("WARNING: JSON response does not contain 'federated_instances':", domain)
                update_last_error(domain, response)
        elif isinstance(data, list):
            # Querying the API was successful
            peers = data
        else:
            # Anything but a list would break len() and iteration in callers
            print(f"WARNING: domain='{domain}' returned data[]='{type(data)}' instead of a list of peers")
    except Exception as e:
        print("WARNING: Some error during get():", domain, e)
        update_last_error(domain, e)

    instances.set("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    return peers
def post_json_api(domain: str, path: str, parameter: str, extra_headers: dict = None) -> dict:
    """Send a POST request to a JSON API and return the decoded answer.

    Args:
        domain: Non-empty domain name; the request goes to https://<domain><path>.
        path: Non-empty request path.
        parameter: Request body.
        extra_headers: Optional headers merged over ``api_headers``.

    Returns:
        The value json_from_response() produced for the response, or an empty
        dict when the request failed with an exception.

    Raises:
        ValueError: If an argument has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' cannot be empty")
    elif not isinstance(parameter, str):
        raise ValueError(f"parameter[]={type(parameter)} is not 'str'")

    # None instead of a shared mutable default argument
    if extra_headers is None:
        extra_headers = {}

    data = {}
    try:
        response = reqto.post(
            f"https://{domain}{path}",
            data=parameter,
            headers={**api_headers, **extra_headers},
            timeout=(config.get("connection_timeout"), config.get("read_timeout"))
        )

        data = json_from_response(response)
        if not response.ok or response.status_code >= 400:
            print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',parameter()={len(parameter)},response.status_code='{response.status_code}',data[]='{type(data)}'")
            update_last_error(domain, response)
    except Exception as e:
        print(f"WARNING: Some error during post(): domain='{domain}',path='{path}',parameter()={len(parameter)},exception[{type(e)}]:'{str(e)}'")

    return data
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch the nodeinfo document of ``domain``.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. When it
    yields nothing, a list of well-known static paths is requested one after
    another until one of them returns a JSON object.

    Args:
        domain: Non-empty domain name.
        path: Optional known nodeinfo location. When given, only the static
            path that equals it (relative, or as https://<domain><path>) is
            requested.

    Returns:
        The decoded nodeinfo, or the last decoded value (initially an empty
        dict) when no request succeeded.

    Raises:
        ValueError: If an argument has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)
    if len(nodeinfo) > 0:
        return nodeinfo

    # NOTE(review): only the two ".json" entries are visible in the reviewed
    # excerpt; the remaining entries are assumed - confirm against the file.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]
    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    data = {}
    for request in request_paths:
        # The original compared path with f"https://{domain}{path}", i.e. with
        # itself plus a prefix, which is never equal and skipped every request.
        # The static check stores the relative request as nodeinfo_url, so
        # both the relative and the absolute form are accepted.
        if path is not None and path != "" and path not in (request, f"https://{domain}{request}"):
            continue

        try:
            response = get_response(domain, request, api_headers, timeout)
            data = json_from_response(response)

            if response.ok and isinstance(data, dict):
                instances.set("detection_mode", domain, "STATIC_CHECK")
                instances.set("nodeinfo_url", domain, request)
                break
            elif response.ok and isinstance(data, list):
                # NOTE(review): the statement following this message is not
                # visible in the reviewed excerpt; trying the next path is assumed.
                print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
            elif not response.ok or response.status_code >= 400:
                print("WARNING: Failed fetching nodeinfo from domain:", domain)
                update_last_error(domain, response)
        except Exception as e:
            update_last_error(domain, e)

    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through "/.well-known/nodeinfo".

    Every link of the well-known document whose "rel" value is listed in
    ``nodeinfo_identifier`` is requested until one returns a JSON object.

    Args:
        domain: Non-empty domain name.

    Returns:
        The decoded nodeinfo, or an empty dict when discovery failed. The
        original returned the well-known link document (or a failed response)
        in that case, which callers then mistook for nodeinfo.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    nodeinfo = {}

    try:
        response = get_response(domain, "/.well-known/nodeinfo", api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))
        wellknown = json_from_response(response)

        if response.ok and isinstance(wellknown, dict):
            if "links" in wellknown:
                for link in wellknown["links"]:
                    if link["rel"] not in nodeinfo_identifier:
                        print("WARNING: Unknown 'rel' value:", domain, link["rel"])
                        continue

                    response = get_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))
                    data = json_from_response(response)

                    if response.ok and isinstance(data, dict):
                        instances.set("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set("nodeinfo_url", domain, link["href"])
                        nodeinfo = data
                        break
            else:
                print("WARNING: nodeinfo does not contain 'links':", domain)
    except Exception as e:
        print("WARNING: Failed fetching .well-known info:", domain)
        update_last_error(domain, e)

    return nodeinfo
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML meta tags of a page.

    The "generator" meta tag is preferred; the "og:site_name" property is the
    fallback. The value found is then cleaned from version numbers and from
    phrases such as "powered by", "hosted on", " by " and " see ".

    Args:
        domain: Non-empty domain name.
        path: Non-empty path of the page to fetch.

    Returns:
        The detected software name, or None when nothing was found.

    Raises:
        ValueError: If an argument has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        # The original reported 'domain' here although it follows the path check
        raise ValueError("Parameter 'path' is empty")

    software = None

    try:
        response = get_response(domain, path, headers, (config.get("connection_timeout"), config.get("read_timeout")))

        if response.ok and response.status_code < 300 and len(response.text) > 0:
            doc = bs4.BeautifulSoup(response.text, "html.parser")

            generator = doc.find("meta", {"name": "generator"})
            site_name = doc.find("meta", {"property": "og:site_name"})

            if isinstance(generator, bs4.element.Tag):
                software = tidyup_domain(generator.get("content"))
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set("detection_mode", domain, "GENERATOR")
                remove_pending_error(domain)
            elif isinstance(site_name, bs4.element.Tag):
                # The original assigned to the misspelled name "sofware", so
                # the site name was never used
                software = tidyup_domain(site_name.get("content"))
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set("detection_mode", domain, "SITE_NAME")
                remove_pending_error(domain)
    except Exception as e:
        update_last_error(domain, e)

    if isinstance(software, str) and software == "":
        # Correct an empty string to None
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = remove_version(software)

    if isinstance(software, str):
        if " powered by " in software:
            software = remove_version(strip_powered_by(software))
        elif " hosted on " in software:
            software = remove_version(strip_hosted_on(software))
        elif " by " in software:
            software = strip_until(software, " by ")
        elif " see " in software:
            software = strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software that runs on ``domain``.

    The nodeinfo document is the primary source. Whenever it is missing, is an
    error message or lacks [software][name], fetch_generator_from_path() is
    used instead. Known forks are mapped to the software they derive from.

    Args:
        domain: Non-empty domain name.
        path: Optional nodeinfo path forwarded to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If an argument has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if not isinstance(data, dict) or len(data) == 0:
        # Could not fetch nodeinfo at all
        return fetch_generator_from_path(domain)

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        return fetch_generator_from_path(domain)

    software = tidyup_domain(data["software"]["name"])

    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = software.split("/")[-1]
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        software = strip_powered_by(software)
    elif " by " in software:
        software = strip_until(software, " by ")
    elif " see " in software:
        software = strip_until(software, " see ")

    # NOTE(review): the condition guarding the warning below is not visible in
    # the reviewed excerpt; an empty-name check is assumed. The original then
    # replaced the name with None, after which `"." in software` raised a
    # TypeError and the generator fallback could never run.
    if software == "":
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = remove_version(strip_powered_by(software))

    return software
def is_instance_registered(domain: str) -> bool:
    """Tell whether ``domain`` is present in the ``instances`` table.

    The first call loads all domains into the cache key "is_registered";
    later calls are answered from that cache.

    Args:
        domain: Non-empty domain name.

    Returns:
        The result of cache.sub_key_exists("is_registered", domain).

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    # Function-scope import: only needed to abort on a failed query
    import sys

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    if not cache.key_exists("is_registered"):
        # Cache is not initialized yet, fetch all rows
        try:
            cursor.execute("SELECT domain FROM instances")
            cache.set_all("is_registered", cursor.fetchall(), True)
        except Exception as e:
            print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
            # NOTE(review): the statement following this message is not visible
            # in the reviewed excerpt; aborting with exit code 255 is assumed.
            sys.exit(255)

    return cache.sub_key_exists("is_registered", domain)
def add_instance(domain: str, origin: str, originator: str, path: str = None):
    """Insert ``domain`` into the ``instances`` table.

    The software is determined first, then the row is written, the
    "is_registered" cache is updated and pending data or errors for the
    domain are flushed.

    Args:
        domain: Non-empty, valid and non-blacklisted domain name.
        origin: Valid domain that led to ``domain``, or None.
        originator: Non-empty name stored in the ``originator`` column.
        path: Optional nodeinfo path forwarded to determine_software().

    Raises:
        ValueError: If an argument has the wrong type, is empty or is not a
            valid domain name.
        Exception: If ``domain`` is blacklisted.
    """
    # Function-scope import: only needed to abort on a failed query
    import sys

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"origin[]={type(origin)} is not 'str'")
    elif not isinstance(originator, str):
        raise ValueError(f"originator[]={type(originator)} is not 'str'")
    elif originator == "":
        raise ValueError("originator cannot be empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"Bad domain name='{domain}'")
    elif origin is not None and not validators.domain(origin.split("/")[0]):
        raise ValueError(f"Bad origin name='{origin}'")
    elif blacklist.is_blacklisted(domain):
        raise Exception(f"domain='{domain}' is blacklisted, but method invoked")

    software = determine_software(domain, path)

    print(f"INFO: Adding instance domain='{domain}' (origin='{origin}',software='{software}')")
    try:
        # NOTE(review): the bound values are not visible in the reviewed
        # excerpt; they are assumed to follow the column list - confirm.
        cursor.execute(
            "INSERT INTO instances (domain, origin, originator, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)",
            (
                domain,
                origin,
                originator,
                get_hash(domain),
                software,
                time.time(),
            ),
        )

        cache.set_sub_key("is_registered", domain, True)

        if instances.has_pending_instance_data(domain):
            # Pending data exists: reset the error columns and flush it
            instances.set("last_status_code", domain, None)
            instances.set("last_error_details", domain, None)
            instances.update_instance_data(domain)
            remove_pending_error(domain)

        if domain in pending_errors:
            # A pending error is still recorded, write it now
            update_last_error(domain, pending_errors[domain])
            remove_pending_error(domain)
    except Exception as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        # NOTE(review): the statement following this message is not visible in
        # the reviewed excerpt; aborting with exit code 255 is assumed.
        sys.exit(255)
    else:
        update_last_nodeinfo(domain)
def send_bot_post(instance: str, blocks: list):
    """Post a summary of the blocks of ``instance`` through the bot account.

    Args:
        instance: Non-empty name of the blocking instance.
        blocks: List of mappings with the keys "blocked" and "reason". Only
            the first 20 entries are posted; the argument is not modified.

    Returns:
        True after the request was sent.
        NOTE(review): the original return statement is not visible in the
        reviewed excerpt - confirm.

    Raises:
        ValueError: If an argument has the wrong type or ``instance`` is empty.
    """
    # The original validated an undefined name "domain" (NameError on every
    # call) and required a dict although the body slices and iterates a list.
    if not isinstance(instance, str):
        raise ValueError(f"Parameter instance[]='{type(instance)}' is not 'str'")
    elif instance == "":
        raise ValueError("Parameter 'instance' is empty")
    elif not isinstance(blocks, list):
        raise ValueError(f"Parameter blocks[]='{type(blocks)}' is not 'list'")

    # Keep 20 entries as the message states; the original slice [0:19] kept 19
    truncated = len(blocks) > 20
    if truncated:
        blocks = blocks[0:20]

    parts = [instance + " has blocked the following instances:\n\n"]
    for block in blocks:
        reason = block["reason"]
        if reason is None or reason == "":
            parts.append(block["blocked"] + " with unspecified reason\n")
            continue

        if len(reason) > 420:
            reason = reason[0:419] + "[…]"

        # A zero-width space after "@" prevents mentions from being triggered
        parts.append(block["blocked"] + ' for "' + reason.replace("@", "@\u200b") + '"\n')

    if truncated:
        parts.append("(the list has been truncated to the first 20 entries)")

    message = "".join(parts)

    botheaders = {**api_headers, "Authorization": "Bearer " + config.get("bot_token")}

    # NOTE(review): apart from the URL, "visibility" and "content_type", the
    # arguments of this request are not visible in the reviewed excerpt; the
    # timeout follows the other requests of this module - confirm.
    reqto.post(
        f"{config.get('bot_instance')}/api/v1/statuses",
        data={
            "status": message,
            "visibility": config.get('bot_visibility'),
            "content_type": "text/plain"
        },
        headers=botheaders,
        timeout=(config.get("connection_timeout"), config.get("read_timeout"))
    )

    return True
# Scrapes the block list a Friendica instance publishes on its /friendica
# page: the element with id "about_blocklist" is looked up and every table
# row after the first one is turned into a "domain"/"reason" entry.
#
# Parameters:
#   domain - host name of the Friendica instance
#
# Raises:
#   ValueError - when `domain` fails the type/emptiness checks below
880 def fetch_friendica_blocks(domain: str) -> dict:
881 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
882 if type(domain) != str:
883 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
885 raise ValueError(f"Parameter 'domain' is empty")
887 # DEBUG: print("DEBUG: Fetching friendica blocks from domain:", domain)
# The page is fetched with the non-API headers and the configured
# connection/read timeouts, then parsed with BeautifulSoup
891 doc = bs4.BeautifulSoup(
892 get_response(domain, "/friendica", headers, (config.get("connection_timeout"), config.get("read_timeout"))).text,
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit;
# `Exception` is probably what is wanted here.
895 except BaseException as e:
896 print("WARNING: Failed to fetch /friendica from domain:", domain, e)
# The failure is recorded for this domain
897 update_last_error(domain, e)
900 blocklist = doc.find(id="about_blocklist")
902 # Prevents exceptions:
903 if blocklist is None:
904 # DEBUG: print("DEBUG: Instance has no block list:", domain)
# The first <tr> is skipped - presumably the table's header row
907 for line in blocklist.find("table").find_all("tr")[1:]:
908 # DEBUG: print(f"DEBUG: line='{line}'")
910 "domain": tidyup_domain(line.find_all("td")[0].text),
# NOTE(review): the reason text is passed through tidyup_domain(), which
# lower-cases it and strips URL/port/user parts; tidyup_reason() looks like
# the intended helper for this column.
911 "reason": tidyup_domain(line.find_all("td")[1].text)
914 # DEBUG: print("DEBUG: Returning blocks() for domain:", domain, len(blocks))
# Collects the instances a Misskey server has suspended or blocked by paging
# through its /api/federation/instances endpoint in two passes, and returns
# them under the keys "reject" (blocked) and "followers_only" (suspended).
#
# Parameters:
#   domain - host name of the Misskey instance
#
# Raises:
#   ValueError - when `domain` fails the type/emptiness checks below
919 def fetch_misskey_blocks(domain: str) -> dict:
920 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
921 if type(domain) != str:
922 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
924 raise ValueError(f"Parameter 'domain' is empty")
926 # DEBUG: print("DEBUG: Fetching misskey blocks from domain:", domain)
# Page size used for advancing the offset, taken from the configuration
933 step = config.get("misskey_limit")
935 # iterating through all "suspended" (follow-only in its terminology)
936 # instances page-by-page, since the API doesn't support
937 # sending them all at once
939 # DEBUG: print(f"DEBUG: Fetching offset='{offset}' from '{domain}' ...")
941 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
# Two request variants exist; the second one additionally sends an "offset"
942 fetched = post_json_api(domain, "/api/federation/instances", json.dumps({
951 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
952 fetched = post_json_api(domain, "/api/federation/instances", json.dumps({
957 "offset" : offset - 1
962 # DEBUG: print("DEBUG: fetched():", len(fetched))
# An empty page means there is nothing more to fetch
963 if len(fetched) == 0:
964 # DEBUG: print("DEBUG: Returned zero bytes, exiting loop:", domain)
966 elif len(fetched) != config.get("misskey_limit"):
967 # DEBUG: print(f"DEBUG: Fetched '{len(fetched)}' row(s) but expected: '{config.get('misskey_limit')}'")
# NOTE(review): on a short page the offset advances by the shortfall
# (limit - rows received), not by the number of rows received - confirm that
# this is intended.
968 offset = offset + (config.get("misskey_limit") - len(fetched))
970 # DEBUG: print("DEBUG: Raising offset by step:", step)
971 offset = offset + step
974 for instance in fetched:
# NOTE(review): has_key() compares the stored "domain" strings with the whole
# `instance` dict, so this duplicate guard can never match; the tidied
# instance["host"] looks like the intended value.
976 if instance["isSuspended"] and not has_key(blocks["suspended"], "domain", instance):
978 blocks["suspended"].append(
980 "domain": tidyup_domain(instance["host"]),
981 # no reason field, nothing
986 # DEBUG: print(f"DEBUG: count={count}")
988 # DEBUG: print(f"DEBUG: API is no more returning new instances, aborting loop!")
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit
991 except BaseException as e:
992 print("WARNING: Caught error, exiting loop:", domain, e)
993 update_last_error(domain, e)
998 # same procedure again for "blocked" (aka full suspend) instances
1001 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
1002 fetched = post_json_api(domain,"/api/federation/instances", json.dumps({
1011 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
1012 fetched = post_json_api(domain,"/api/federation/instances", json.dumps({
1017 "offset" : offset - 1
1022 # DEBUG: print("DEBUG: fetched():", len(fetched))
1023 if len(fetched) == 0:
1024 # DEBUG: print("DEBUG: Returned zero bytes, exiting loop:", domain)
1026 elif len(fetched) != config.get("misskey_limit"):
1027 # DEBUG: print(f"DEBUG: Fetched '{len(fetched)}' row(s) but expected: '{config.get('misskey_limit')}'")
# NOTE(review): same shortfall-based offset arithmetic as in the first pass
1028 offset = offset + (config.get("misskey_limit") - len(fetched))
1030 # DEBUG: print("DEBUG: Raising offset by step:", step)
1031 offset = offset + step
1034 for instance in fetched:
# NOTE(review): same never-matching duplicate guard as in the first pass
1036 if instance["isBlocked"] and not has_key(blocks["blocked"], "domain", instance):
1038 blocks["blocked"].append({
1039 "domain": tidyup_domain(instance["host"]),
1043 # DEBUG: print(f"DEBUG: count={count}")
1045 # DEBUG: print(f"DEBUG: API is no more returning new instances, aborting loop!")
1048 except BaseException as e:
1049 print("ERROR: Exception during POST:", domain, e)
1050 update_last_error(domain, e)
1054 # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
# The fetch is recorded for this domain before the result is handed back
1055 instances.update_last_instance_fetch(domain)
1057 # DEBUG: print("DEBUG: Returning for domain,blocked(),suspended():", domain, len(blocks["blocked"]), len(blocks["suspended"]))
# Misskey's "blocked" list is reported as "reject", its "suspended" list as
# "followers_only"
1059 "reject" : blocks["blocked"],
1060 "followers_only": blocks["suspended"]
def tidyup_reason(reason: str) -> str:
    """Clean up a block reason taken from a remote block list.

    Leading and trailing whitespace is removed and every "â" character is
    replaced by a plain double quote (presumably the "â" is what remains of
    a mis-decoded typographic quote).

    Args:
        reason: Raw reason text.

    Returns:
        The cleaned-up reason.

    Raises:
        ValueError: If ``reason`` is not a string.
    """
    # DEBUG: print(f"DEBUG: reason='{reason}' - CALLED!")
    if not isinstance(reason, str):
        raise ValueError(f"Parameter reason[]={type(reason)} is not expected")

    # Strip string first, then swap the stray character; this is a literal
    # replacement, so no regular expression is needed
    reason = reason.strip().replace("â", "\"")

    # DEBUG: print(f"DEBUG: reason='{reason}' - EXIT!")
    return reason
def tidyup_domain(domain: str) -> str:
    """Reduce a scraped domain entry to a bare, lower-case host name.

    The value is lower-cased and trimmed, then a protocol prefix, a single
    trailing slash, a port number, a leading "@" and any "user@" part are
    removed, in that order. The trailing slash is removed before the port
    so that a value like "host:8080/" loses its port as well.

    Args:
        domain: Raw domain text, possibly an URL or an account handle.

    Returns:
        The cleaned-up domain.

    Raises:
        ValueError: If ``domain`` is not a string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not expected")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:/*", "", domain)

    # No trailing slash
    domain = re.sub(r"/$", "", domain)

    # No port number (only matches once the trailing slash is gone)
    domain = re.sub(r":\d+$", "", domain)

    # No leading @
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r".+@", "", domain)

    # DEBUG: print(f"DEBUG: domain='{domain}' - EXIT!")
    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of a response, tolerating empty or broken bodies.

    Args:
        response: Response object returned by a request.

    Returns:
        The decoded JSON document, or an empty list when the body is blank
        or cannot be decoded. Note that the decoded document has whatever
        type the remote side sent, not necessarily a list.

    Raises:
        ValueError: If ``response`` is not a ``requests`` response object.
    """
    # DEBUG: print(f"DEBUG: response[]={type(response)} - CALLED!")
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    data = list()
    if response.text.strip() != "":
        # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...")
        try:
            data = response.json()
        except ValueError:
            # Deliberately best-effort: an undecodable body counts as "no data".
            # ValueError is the common base of json.JSONDecodeError and of the
            # simplejson-based error raised when simplejson is installed, which
            # a json.decoder.JSONDecodeError handler would let through.
            pass

    # DEBUG: print(f"DEBUG: data[]={type(data)} - EXIT!")
    return data
# Sends a GET request for `path` on `domain` (always via https://) through
# reqto.get() and records connection failures for the domain.
#
# Parameters:
#   domain  - host name to contact; must be a non-empty string
#   path    - request path appended directly to the host name; must be a
#             non-empty string
#   headers - HTTP headers for the request
#   timeout - timeout value(s) for the request
#
# Raises:
#   ValueError - when `domain` or `path` fails the checks below
#
# NOTE(review): `timeout` is annotated as list, but fetch_friendica_blocks()
# in this file passes a (connection_timeout, read_timeout) tuple.
1119 def get_response(domain: str, path: str, headers: dict, timeout: list) -> requests.models.Response:
1120 # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',headers()={len(headers)},timeout={timeout} - CALLED!")
1121 if type(domain) != str:
1122 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
1124 raise ValueError("Parameter 'domain' is empty")
1125 elif type(path) != str:
1126 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
1128 raise ValueError("Parameter 'path' is empty")
1131 # DEBUG: print(f"DEBUG: Sending request to '{domain}{path}' ...")
1132 response = reqto.get(
# The scheme is fixed to https; `path` is expected to bring its own leading "/"
1133 f"https://{domain}{path}",
# Only connection errors are handled here; they are stored for the domain
1137 except requests.exceptions.ConnectionError as e:
1138 # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(e)}]='{str(e)}'")
1139 update_last_error(domain, e)
1142 # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!")
def has_key(keys: list, search: str, value: object) -> bool:
    """Check whether any dict in a list holds ``value`` under key ``search``.

    The list is scanned in order and the scan stops at the first match, so
    elements after a match are not validated.

    Args:
        keys: List of dicts to look through.
        search: Name of the key every inspected dict must contain.
        value: Value to compare ``element[search]`` with (using ``==``).

    Returns:
        True when a matching element was found, otherwise False.

    Raises:
        ValueError: If ``keys`` is not a list, ``search`` is not a non-empty
            string, or an inspected element is not a dict.
        KeyError: If an inspected element lacks the key ``search``.
    """
    # DEBUG: print(f"DEBUG: keys()={len(keys)},search='{search}',value[]='{type(value)}' - CALLED!")
    if not isinstance(keys, list):
        raise ValueError(f"Parameter keys[]='{type(keys)}' is not 'list'")
    elif not isinstance(search, str):
        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
    elif search == "":
        raise ValueError("Parameter 'search' is empty")

    has = False
    # DEBUG: print(f"DEBUG: Checking keys()={len(keys)} ...")
    for key in keys:
        # DEBUG: print(f"DEBUG: key['{type(key)}']={key}")
        if not isinstance(key, dict):
            raise ValueError(f"key[]='{type(key)}' is not 'dict'")
        elif search not in key:
            raise KeyError(f"Cannot find search='{search}'")
        elif key[search] == value:
            has = True
            break

    # DEBUG: print(f"DEBUG: has={has} - EXIT!")
    return has
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    Every ``<tr>`` below ``tag`` is inspected: the first ``<td>`` holds the
    domain, the second one the reason. Rows without any ``<td>``,
    blacklisted domains and invalid domains are skipped. One known row
    that lists several gab domains in a single cell is expanded into one
    entry per domain.

    Args:
        tag: Table element (or any element containing the rows).

    Returns:
        List of dicts with the keys "domain" and "reason". The reason is
        None for rows that consist of a domain cell only.

    Raises:
        ValueError: If ``tag`` is not a ``bs4.element.Tag``.
        KeyError: If ``tag`` contains no table rows.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Select the rows once instead of once for the check and once for the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        # DEBUG: print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup_domain(cells[0].text)

        # A row may consist of the domain cell only; do not fail on it
        reason = tidyup_reason(cells[1].text) if len(cells) > 1 else None

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print(f"DEBUG: Multiple domains detected in one row")
            for gab_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
# Fetches a complete URL by splitting it with urlparse() and handing host
# name, path (plus query string, if any) to get_response().
#
# Parameters:
#   url     - URL to fetch; must be a non-empty string
#   headers - HTTP headers, passed through to get_response()
#   timeout - timeout value(s), passed through to get_response()
#
# Raises:
#   ValueError - when `url` fails the checks below
#
# NOTE(review): only hostname, path and query of the URL are forwarded.
# Scheme, port and fragment are dropped, and get_response() always builds an
# https:// URL - confirm that http:// or non-default-port URLs never get here.
# NOTE(review): an URL without a path (e.g. "https://example.com") yields an
# empty components.path, which get_response() appears to reject as an empty
# 'path' - confirm.
1221 def get_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
1222 # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
1223 if type(url) != str:
1224 raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
1226 raise ValueError("Parameter 'url' is empty")
1228 # DEBUG: print(f"DEBUG: Parsing url='{url}'")
1229 components = urlparse(url)
1231 # Invoke other function, avoid trailing ?
1232 # DEBUG: print(f"DEBUG: components[{type(components)}]={components}")
1233 if components.query != "":
1234 response = get_response(components.hostname, f"{components.path}?{components.query}", headers, timeout)
1236 response = get_response(components.hostname, f"{components.path}", headers, timeout)
1238 # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!")