1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
from urllib.parse import urlparse

from fba import blacklist
from fba import config
from fba import instances

from fba.federation import lemmy
from fba.federation import misskey
from fba.federation import peertube

# Pending errors keyed by domain, waiting to be written to the database
pending_errors = {
}

# "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]

# HTTP headers for non-API requests
headers = {
    "User-Agent": config.get("useragent"),
}

# HTTP headers for API requests
api_headers = {
    "User-Agent": config.get("useragent"),
    "Content-Type": "application/json",
}

# URL for fetching peers
get_peers_url = "/api/v1/instance/peers"

# Connect to database (module-level side effect, shared by all functions below)
connection = sqlite3.connect("blocks.db")
cursor = connection.cursor()

# Pattern instances for version numbers. Raw strings are used so that "\." and
# "\d" reach the regex engine without relying on invalid string escapes.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev]
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0
    re.compile(r"^[a-f0-9]{7}$"),
]

85 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Return True when *var* is None or exactly an int, str, float or bool.

    The exact type is compared, so instances of subclasses of these types
    are not reported as primitive.
    """
    # Identity test instead of "== None": an object with a permissive
    # __eq__() must not be mistaken for None.
    return var is None or type(var) in {int, str, float, bool}

def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register *domain* if unknown, then register every usable peer it reports.

    Parameters:
        domain:   Domain whose peers are fetched; must be a non-empty str.
        origin:   Domain that led to *domain*, or None.
        software: Software name handed to get_peers().
        script:   Name of the invoking script, stored as originator; must be
                  a non-empty str.
        path:     Optional nodeinfo path forwarded to add_instance().

    Raises:
        ValueError: When a parameter has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        # The message used to name 'domain' here, which pointed at the wrong argument.
        raise ValueError("Parameter 'script' is empty")

    if not is_instance_registered(domain):
        add_instance(domain, origin, script, path)

    peerlist = get_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data collected while fetching the peers
        instances.update_instance_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        # Skip "None" entries, they cannot be tidied up
        if instance is None:
            continue

        instance = tidyup_domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        try:
            if not is_instance_registered(instance):
                add_instance(instance, domain, script)
        except BaseException as e:
            # Best effort: one failing peer must not abort the whole run.
            print(f"ERROR: instance='{instance}',exception[{type(e)}]:'{str(e)}'")
            continue

def add_peers(rows: dict) -> list:
    """Collect the non-blacklisted peers from a 'federated_instances' mapping.

    Parameters:
        rows: Mapping that may contain the keys "linked", "allowed" and
              "blocked", each holding an iterable of peer names or None.

    Returns:
        A list with every peer after tidyup_domain(), minus blacklisted ones.
    """
    peers = []
    for key in ["linked", "allowed", "blocked"]:
        # Absent keys and explicit None values carry no peers
        if rows.get(key) is None:
            continue

        for peer in rows[key]:
            peer = tidyup_domain(peer)

            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers

def remove_version(software: str) -> str:
    """Strip a trailing version number from a software string.

    Everything after the first ";", "," or " - " is dropped first. The last
    token (split by " ", "/" or "-") is then checked against the module-level
    `patterns`; when it matches, the token and its separator are removed.

    Returns:
        The software name without version, or *software* untouched when no
        version number could be recognised.
    """
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    # Cut off trailing remarks
    temp = software
    if ";" in software:
        temp = software.split(";")[0]
    elif "," in software:
        temp = software.split(",")[0]
    elif " - " in software:
        temp = software.split(" - ")[0]

    # Look for the separator in the already truncated string. Checking the
    # full string picked a separator that only existed in the cut-off part,
    # e.g. "Foo/1.2, bar baz" was split by " " and never matched.
    if " " in temp:
        version = temp.split(" ")[-1]
    elif "/" in temp:
        version = temp.split("/")[-1]
    elif "-" in temp:
        version = temp.split("-")[-1]
    else:
        # No common separator found, leave untouched
        return software

    if not any(pattern.match(version) for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    # Drop the version and the single separator character in front of it
    end = len(temp) - len(version) - 1
    software = temp[0:end].strip()

    if " version" in software:
        software = strip_until(software, " version")

    return software

def strip_powered_by(software: str) -> str:
    """Return the part of *software* that follows "powered by".

    Anything after a " - " in the remainder is removed as well.

    Returns:
        The stripped name (possibly empty), or *software* untouched when it
        does not contain "powered by".

    Raises:
        ValueError: When *software* is empty.
    """
    marker = "powered by"

    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
        return software

    # Search for the same marker that was tested above. Searching
    # "powered by " (with trailing space) returned -1 for strings ending in
    # the marker and then sliced at a bogus offset.
    start = software.find(marker)
    software = software[start + len(marker):].strip()

    # Only invoke strip_until() when there is something to strip; it rejects
    # empty input and warns when the separator is absent.
    if " - " in software:
        software = strip_until(software, " - ")

    return software

def strip_hosted_on(software: str) -> str:
    """Return the part of *software* that precedes "hosted on".

    Anything after a " - " in the remainder is removed as well.

    Returns:
        The stripped name (possibly empty), or *software* untouched when it
        does not contain "hosted on".

    Raises:
        ValueError: When *software* is empty.
    """
    marker = "hosted on"

    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif marker not in software:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    end = software.find(marker)

    # Slice up to the marker. The previous "software[0, start]" indexed the
    # string with a tuple and referenced an undefined name.
    software = software[0:end].strip()

    # Only invoke strip_until() when there is something to strip; it rejects
    # empty input and warns when the separator is absent.
    if " - " in software:
        software = strip_until(software, " - ")

    return software

def strip_until(software: str, until: str) -> str:
    """Cut *software* at the first occurrence of *until* and trim whitespace.

    Returns:
        The text in front of *until*. *software* is returned untouched when
        *until* is absent or located at the very beginning.

    Raises:
        ValueError: When *software* or *until* is empty.
    """
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif until == "":
        print("ERROR: Bad method call, 'until' is empty")
        raise ValueError("Parameter 'until' is empty")
    elif until not in software:
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    # Next, strip until part
    end = software.find(until)

    # A match at position 0 would leave nothing behind, so it is ignored
    if end > 0:
        software = software[0:end].strip()

    return software

def remove_pending_error(domain: str):
    """Forget a pending error for *domain*, if one is recorded.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Prevent updating any pending errors, nodeinfo was found. pop() with a
    # default ignores unknown domains without a catch-all except clause.
    pending_errors.pop(domain, None)

def get_hash(domain: str) -> str:
    """Return the hexadecimal SHA-256 digest of the UTF-8 encoded *domain*.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    return hashlib.sha256(domain.encode("utf-8")).hexdigest()

def log_error(domain: str, response: requests.models.Response):
    """Write an error for *domain* into table error_log and purge old rows.

    Parameters:
        domain:   Affected domain; must be a non-empty str.
        response: Either a response object (its status_code and reason are
                  stored), an exception, or an error message string. The
                  latter two are stored with error code 999.

    Raises:
        ValueError: When *domain* is not a str or is empty.

    The process is terminated with exit code 255 when a query fails.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        # json.decoder.JSONDecodeError is an exception subclass as well, so
        # a single check covers it.
        if isinstance(response, BaseException):
            response = str(response)

        # One timestamp for both the new row and the purge threshold
        now = time.time()

        if isinstance(response, str):
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",[
                domain,
                response,
                now
            ])
        else:
            cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",[
                domain,
                response.status_code,
                response.reason,
                now
            ])

        # Cleanup old entries
        cursor.execute("DELETE FROM error_log WHERE created < ?", [now - config.get("error_log_cleanup")])
    except BaseException as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        sys.exit(255)

def update_last_error(domain: str, response: requests.models.Response):
    """Record the last error of *domain* on the instance and in the error log.

    Parameters:
        domain:   Affected domain; must be a non-empty str.
        response: Either a response object (its status_code and reason are
                  stored), an exception, or an error message string. The
                  latter two are stored with status code 999.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    if isinstance(response, BaseException):
        # Keep exception type and message. The former f-string lacked the
        # call and the braces and always stored the same useless text.
        response = f"{type(response)}:{str(response)}"

    if isinstance(response, str):
        instances.set("last_status_code"  , domain, 999)
        instances.set("last_error_details", domain, response)
    else:
        instances.set("last_status_code"  , domain, response.status_code)
        instances.set("last_error_details", domain, response.reason)

    # Running pending updates
    instances.update_instance_data(domain)

    log_error(domain, response)

def update_last_nodeinfo(domain: str):
    """Stamp last_nodeinfo and last_updated of *domain* with the current time.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Read the clock once so both columns carry the same value
    now = time.time()
    instances.set("last_nodeinfo", domain, now)
    instances.set("last_updated" , domain, now)

    # Running pending updates
    instances.update_instance_data(domain)

def get_peers(domain: str, software: str) -> list:
    """Fetch the peer list of *domain*.

    Misskey, Lemmy and PeerTube are delegated to their federation modules.
    For everything else `get_peers_url` is tried first and "/api/v3/site"
    (key "federated_instances") as fallback.

    Parameters:
        domain:   Domain to query; must be a non-empty str.
        software: Software name of the domain, or None.

    Returns:
        The peers reported by the domain; an empty list when nothing could
        be fetched.

    Raises:
        ValueError: When a parameter has the wrong type or *domain* is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        return misskey.get_peers(domain)
    elif software == "lemmy":
        return lemmy.get_peers(domain)
    elif software == "peertube":
        return peertube.get_peers(domain)

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    peers = []
    try:
        response = get_response(domain, get_peers_url, api_headers, timeout)

        data = json_from_response(response)

        if not response.ok or response.status_code >= 400:
            # Was not able to fetch the peers URL, trying alternative
            response = get_response(domain, "/api/v3/site", api_headers, timeout)

            data = json_from_response(response)
            if not response.ok or response.status_code >= 400:
                print("WARNING: Could not reach any JSON API:", domain)
                update_last_error(domain, response)
            elif isinstance(data, list):
                # NOTE(review): a list from /api/v3/site is not supported.
                # The SystemExit is caught by the handler below and ends up
                # recorded as the domain's last error - confirm intent.
                sys.exit(255)
            elif "federated_instances" in data:
                peers = peers + add_peers(data["federated_instances"])
            else:
                print("WARNING: JSON response does not contain 'federated_instances':", domain)
                update_last_error(domain, response)
        else:
            # Querying API was successful
            peers = data

    except BaseException as e:
        print("WARNING: Some error during get():", domain, e)
        update_last_error(domain, e)

    instances.set("total_peers", domain, len(peers))

    instances.update_last_instance_fetch(domain)

    return peers

def post_json_api(domain: str, path: str, parameter: str, extra_headers: dict = None) -> dict:
    """POST *parameter* to https://<domain><path> and return the decoded reply.

    Parameters:
        domain:        Target domain; must be a non-empty str.
        path:          Request path; must be a non-empty str.
        parameter:     Request body as str.
        extra_headers: Optional headers merged over `api_headers`.

    Returns:
        Whatever json_from_response() yields for the response; an empty dict
        when the request raised.

    Raises:
        ValueError: When a parameter has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' cannot be empty")
    elif not isinstance(parameter, str):
        raise ValueError(f"parameter[]={type(parameter)} is not 'str'")

    # A None default replaces the shared mutable "{}" default argument
    if extra_headers is None:
        extra_headers = {}

    data = {}
    try:
        response = reqto.post(
            f"https://{domain}{path}",
            data=parameter,
            headers={**api_headers, **extra_headers},
            timeout=(config.get("connection_timeout"), config.get("read_timeout"))
        )

        data = json_from_response(response)
        if not response.ok or response.status_code >= 400:
            print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',parameter()={len(parameter)},response.status_code='{response.status_code}',data[]='{type(data)}'")
            update_last_error(domain, response)

    except BaseException as e:
        print(f"WARNING: Some error during post(): domain='{domain}',path='{path}',parameter()={len(parameter)},exception[{type(e)}]:'{str(e)}'")

    return data

def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo of *domain*, auto-discovery first, static paths second.

    Parameters:
        domain: Domain to query; must be a non-empty str.
        path:   Optional known nodeinfo location. When given, only the static
                path equal to it (relative or as https URL) is requested.

    Returns:
        The decoded nodeinfo document, or the result of the last attempt
        (an empty dict when nothing was requested).

    Raises:
        ValueError: When a parameter has the wrong type or *domain* is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    if len(nodeinfo) > 0:
        return nodeinfo

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    data = {}
    for request in request_paths:
        # Compare the wanted path with the candidate. It used to be compared
        # with a URL built from itself, which never matched and so skipped
        # every request as soon as a path was given.
        if path is not None and path != "" and path not in (request, f"https://{domain}{request}"):
            continue

        try:
            response = get_response(domain, request, api_headers, timeout)

            data = json_from_response(response)
            if response.ok and isinstance(data, dict):
                instances.set("detection_mode", domain, "STATIC_CHECK")
                instances.set("nodeinfo_url"  , domain, request)
                break
            elif response.ok and isinstance(data, list):
                print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
                # NOTE(review): the SystemExit is caught by the handler below
                # and recorded as last error - confirm intent.
                sys.exit(255)
            elif not response.ok or response.status_code >= 400:
                print("WARNING: Failed fetching nodeinfo from domain:", domain)
                update_last_error(domain, response)
                continue

        except BaseException as e:
            # Cannot fetch this path, remember the error and try the next one
            update_last_error(domain, e)

    return data

def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover nodeinfo of *domain* through /.well-known/nodeinfo.

    Every link whose "rel" is listed in `nodeinfo_identifier` is requested
    until one returns a JSON object.

    Returns:
        The decoded document of the last request made; an empty dict when
        the first request raised.
        NOTE(review): when no link yields a JSON object, the .well-known
        document itself is returned - confirm callers expect that.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    data = {}

    try:
        response = get_response(domain, "/.well-known/nodeinfo", api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))

        data = json_from_response(response)
        if response.ok and isinstance(data, dict):
            nodeinfo = data
            if "links" in nodeinfo:
                for link in nodeinfo["links"]:
                    if link["rel"] in nodeinfo_identifier:
                        response = get_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))

                        data = json_from_response(response)
                        if response.ok and isinstance(data, dict):
                            instances.set("detection_mode", domain, "AUTO_DISCOVERY")
                            instances.set("nodeinfo_url"  , domain, link["href"])
                            break
                    else:
                        print("WARNING: Unknown 'rel' value:", domain, link["rel"])
            else:
                print("WARNING: nodeinfo does not contain 'links':", domain)

    except BaseException as e:
        print("WARNING: Failed fetching .well-known info:", domain)
        update_last_error(domain, e)

    return data

def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of *domain* from the HTML found at *path*.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fallback. Version numbers and phrases such as "powered by",
    "hosted on", " by " and " see " are stripped from the result.

    Returns:
        The software name, or None when nothing usable was found.

    Raises:
        ValueError: When a parameter has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        # The message used to name 'domain' here
        raise ValueError("Parameter 'path' is empty")

    software = None

    try:
        response = get_response(domain, path, headers, (config.get("connection_timeout"), config.get("read_timeout")))

        if response.ok and response.status_code < 300 and len(response.text) > 0:
            doc = bs4.BeautifulSoup(response.text, "html.parser")

            generator = doc.find("meta", {"name": "generator"})
            site_name = doc.find("meta", {"property": "og:site_name"})

            if isinstance(generator, bs4.element.Tag):
                software = tidyup_domain(generator.get("content"))
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set("detection_mode", domain, "GENERATOR")
                remove_pending_error(domain)
            elif isinstance(site_name, bs4.element.Tag):
                # Assign to 'software'; a misspelled name used to discard
                # the value found here.
                software = tidyup_domain(site_name.get("content"))
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set("detection_mode", domain, "SITE_NAME")
                remove_pending_error(domain)

    except BaseException as e:
        # Cannot fetch the page, remember the error and carry on
        update_last_error(domain, e)

    if isinstance(software, str) and software == "":
        # Corrected empty string to None
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        software = remove_version(software)

    if isinstance(software, str) and " powered by " in software:
        software = remove_version(strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = remove_version(strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = strip_until(software, " see ")

    return software

def determine_software(domain: str, path: str = None) -> str:
    """Determine which software runs on *domain*.

    The name is taken from nodeinfo ([software][name]) and normalised: known
    forks are mapped to their upstream name, decorations and version numbers
    are stripped. Whenever nodeinfo is missing, erroneous or yields no name,
    fetch_generator_from_path() is used instead.

    Returns:
        The software name, or whatever fetch_generator_from_path() returned.

    Raises:
        ValueError: When a parameter has the wrong type or *domain* is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if not isinstance(data, dict) or len(data) == 0:
        # Could not determine software type
        return fetch_generator_from_path(domain)

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # JSON response does not include [software][name]
        return fetch_generator_from_path(domain)

    software = tidyup_domain(data["software"]["name"])

    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        # Was a call to the non-existent name "tidup_domain"
        software = tidyup_domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        software = strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = strip_until(software, " see ")

    if software == "":
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        software = None

    # Test for None explicitly. The previous str(software) == "" comparison
    # never matched "None" and the following "in" test raised a TypeError,
    # so the generator fallback was unreachable.
    if software is None:
        software = fetch_generator_from_path(domain)
    elif "." in software or " " in software:
        software = remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = remove_version(strip_powered_by(software))

    return software

def is_instance_registered(domain: str) -> bool:
    """Tell whether *domain* is present in cache section "is_registered".

    On first use the section is filled with all domains from table
    instances. The process is terminated with exit code 255 when that query
    fails.

    Raises:
        ValueError: When *domain* is not a str or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    if not cache.key_exists("is_registered"):
        # Cache not initialized yet, fetching all rows
        try:
            cursor.execute("SELECT domain FROM instances")

            # Check Set all
            cache.set_all("is_registered", cursor.fetchall(), True)
        except BaseException as e:
            print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
            sys.exit(255)

    # Is cache found?
    return cache.sub_key_exists("is_registered", domain)

def add_instance(domain: str, origin: str, originator: str, path: str = None):
    """Insert *domain* into table instances and flush its pending data.

    Parameters:
        domain:     Domain to add; must be a valid, non-blacklisted domain.
        origin:     Domain that led to *domain*, or None.
        originator: Name of the invoking script; must be a non-empty str.
        path:       Optional nodeinfo path passed to determine_software().

    Raises:
        ValueError: When a parameter has the wrong type, is empty or is not
                    a valid domain name.
        Exception:  When *domain* is blacklisted.

    The process is terminated with exit code 255 when the insert fails.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"origin[]={type(origin)} is not 'str'")
    elif not isinstance(originator, str):
        raise ValueError(f"originator[]={type(originator)} is not 'str'")
    elif originator == "":
        raise ValueError("originator cannot be empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"Bad domain name='{domain}'")
    elif origin is not None and not validators.domain(origin.split("/")[0]):
        raise ValueError(f"Bad origin name='{origin}'")
    elif blacklist.is_blacklisted(domain):
        raise Exception(f"domain='{domain}' is blacklisted, but method invoked")

    software = determine_software(domain, path)

    print(f"INFO: Adding instance domain='{domain}' (origin='{origin}',software='{software}')")
    try:
        cursor.execute(
            "INSERT INTO instances (domain, origin, originator, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)",
            (
                domain,
                origin,
                originator,
                get_hash(domain),
                software,
                time.time()
            ),
        )

        cache.set_sub_key("is_registered", domain, True)

        if instances.has_pending_instance_data(domain):
            # Nodeinfo was found, so an earlier error is obsolete
            instances.set("last_status_code"  , domain, None)
            instances.set("last_error_details", domain, None)
            instances.update_instance_data(domain)
            remove_pending_error(domain)

        if domain in pending_errors:
            update_last_error(domain, pending_errors[domain])
            remove_pending_error(domain)

    except BaseException as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        sys.exit(255)
    else:
        # Only reached when the insert succeeded
        update_last_nodeinfo(domain)

def send_bot_post(instance: str, blocks: list):
    """Post a summary of the blocks issued by *instance* through the bot account.

    Parameters:
        instance: Domain that issued the blocks; must be a non-empty str.
        blocks:   List of dicts with the keys "blocked" and "reason". Only
                  the first 20 entries are posted.

    Returns:
        True once the status was posted.

    Raises:
        ValueError: When a parameter has the wrong type or *instance* is
                    empty.
    """
    # Validate the parameter that actually exists. The checks used to
    # reference an undefined name "domain" and failed on every call.
    if not isinstance(instance, str):
        raise ValueError(f"Parameter instance[]='{type(instance)}' is not 'str'")
    elif instance == "":
        raise ValueError("Parameter 'instance' is empty")
    elif not isinstance(blocks, list):
        # The body slices and iterates block records, which needs a list
        raise ValueError(f"Parameter blocks[]='{type(blocks)}' is not 'list'")

    message = instance + " has blocked the following instances:\n\n"
    truncated = False

    if len(blocks) > 20:
        truncated = True
        # Keep 20 entries as announced in the message ([0 : 19] kept 19)
        blocks = blocks[:20]

    for block in blocks:
        # Work on a local copy of the reason, the caller's dict stays untouched
        reason = block["reason"]

        if reason is None or reason == '':
            message = message + block["blocked"] + " with unspecified reason\n"
        else:
            if len(reason) > 420:
                reason = reason[0:419] + "[…]"

            # A zero-width space after "@" prevents accidental mentions
            message = message + block["blocked"] + ' for "' + reason.replace("@", "@\u200b") + '"\n'

    if truncated:
        message = message + "(the list has been truncated to the first 20 entries)"

    botheaders = {**api_headers, **{"Authorization": "Bearer " + config.get("bot_token")}}

    # The decoded reply is not used, but json() is still invoked so that a
    # non-JSON answer raises as before.
    reqto.post(
        f"{config.get('bot_instance')}/api/v1/statuses",
        data={
            "status"      : message,
            "visibility"  : config.get('bot_visibility'),
            "content_type": "text/plain"
        },
        headers=botheaders,
        timeout=10
    ).json()

    return True

def fetch_friendica_blocks(domain: str) -> dict:
    """Scrape the public block list from a Friendica instance's /friendica page.

    Parameters:
        domain: Domain name of the Friendica instance.

    Returns:
        ``{"reject": [{"domain": ..., "reason": ...}, ...]}`` on success, or
        an empty dict when the page cannot be fetched or carries no block
        list.

    Raises:
        ValueError: if ``domain`` is not a non-empty str.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        doc = bs4.BeautifulSoup(
            get_response(domain, "/friendica", headers, (config.get("connection_timeout"), config.get("read_timeout"))).text,
            "html.parser",
        )
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate
        print("WARNING: Failed to fetch /friendica from domain:", domain, e)
        update_last_error(domain, e)
        return {}

    blocklist = doc.find(id="about_blocklist")

    # Prevents exceptions:
    if blocklist is None:
        return {}

    table = blocklist.find("table")
    if table is None:
        # Block list section without a table, nothing to parse
        return {}

    blocks = []

    # First row is the table header
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 2:
            # Malformed row, needs a domain and a reason cell
            continue

        blocks.append({
            "domain": tidyup_domain(cells[0].text),
            # Reasons are free text and must not be treated like a domain
            "reason": tidyup_reason(cells[1].text)
        })

    return {
        "reject": blocks
    }
def _fetch_misskey_flagged(domain: str, filter_key: str, flag_key: str, error_prefix: str) -> list:
    """Page through /api/federation/instances and collect the flagged hosts.

    Parameters:
        domain:       Domain name of the Misskey instance being queried.
        filter_key:   Request filter to enable ("suspended" or "blocked").
        flag_key:     Field of each returned instance that must be true
                      ("isSuspended" or "isBlocked").
        error_prefix: Text printed in front of the domain and the exception
                      when a request or a row fails.

    Returns:
        List of ``{"domain": ..., "reason": None}`` dicts, one per distinct
        host. Entries collected before an error are kept.
    """
    # NOTE(review): the "sort"/"host"/"limit" payload fields and the "Origin"
    # header were not visible in the reviewed excerpt - confirm them against
    # the existing request before merging.
    step = config.get("misskey_limit")
    offset = 0
    found = []
    seen = set()

    while True:
        payload = {
            "sort"    : "+pubAt",
            "host"    : None,
            filter_key: True,
            "limit"   : step
        }

        # The very first page is requested without an offset
        if offset != 0:
            payload["offset"] = offset - 1

        try:
            fetched = post_json_api(domain, "/api/federation/instances", json.dumps(payload), {
                "Origin": domain
            })

            if len(fetched) == 0:
                # Nothing returned, all pages have been consumed
                break
            elif len(fetched) != step:
                offset = offset + (step - len(fetched))
            else:
                offset = offset + step

            count = 0
            for instance in fetched:
                # just in case
                if not instance[flag_key]:
                    continue

                # Compare the cleaned host name (not the whole row) so rows
                # that were already collected are really recognised
                host = tidyup_domain(instance["host"])
                if host in seen:
                    continue

                seen.add(host)
                count = count + 1
                found.append({
                    "domain": host,
                    # no reason field, nothing
                    "reason": None
                })

            if count == 0:
                # API is no more returning new instances
                break

        except Exception as e:
            print(error_prefix, domain, e)
            update_last_error(domain, e)
            break

    return found

def fetch_misskey_blocks(domain: str) -> dict:
    """Fetch suspended and blocked instances from a Misskey instance.

    Misskey only returns these lists page-by-page, so both are collected by
    repeated API requests. Each list starts paging from the beginning again.

    Parameters:
        domain: Domain name of the Misskey instance.

    Returns:
        ``{"reject": [...], "followers_only": [...]}`` where "reject" holds
        the blocked and "followers_only" the suspended instances.

    Raises:
        ValueError: if ``domain`` is not a non-empty str.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # "suspended" is follow-only in Misskey's terminology
    suspended = _fetch_misskey_flagged(domain, "suspended", "isSuspended", "WARNING: Caught error, exiting loop:")

    # "blocked" is a full suspend
    blocked = _fetch_misskey_flagged(domain, "blocked", "isBlocked", "ERROR: Exception during POST:")

    instances.update_last_instance_fetch(domain)

    return {
        "reject"        : blocked,
        "followers_only": suspended
    }
def tidyup_reason(reason: str) -> str:
    """Return a cleaned-up copy of a block reason.

    Surrounding whitespace is removed and every "â" character is replaced
    with a double quote.

    Raises:
        ValueError: if ``reason`` is not exactly of type str.
    """
    if type(reason) is not str:
        raise ValueError(f"Parameter reason[]={type(reason)} is not expected")

    # Trim first, then swap the stray character for a plain double quote
    cleaned = reason.strip()
    return cleaned.replace("â", "\"")
def tidyup_domain(domain: str) -> str:
    """Normalise a domain name taken from a block list.

    The value is lower-cased and trimmed, then a trailing dot, a trailing
    slash, a port number, an http(s) scheme, a leading "@" and any "user@"
    prefix are removed.

    Parameters:
        domain: Raw domain, URL or handle as found in a block list.

    Returns:
        The cleaned-up domain name.

    Raises:
        ValueError: if ``domain`` is not a str.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not expected")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No trailing slash (must go first, else it hides the port from the
    # end-anchored pattern below)
    domain = re.sub(r"/$", "", domain)

    # No port number
    domain = re.sub(r":\d+$", "", domain)

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:(/*)", "", domain)

    # No @ sign
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r"(.+)@", "", domain)

    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of *response*.

    Returns the decoded data, or an empty list when the body is blank or
    cannot be decoded as JSON.

    Raises:
        ValueError: if ``response`` is not a requests Response.
    """
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    # A blank body cannot be decoded, hand back the empty default right away
    if response.text.strip() == "":
        return list()

    try:
        return response.json()
    except json.decoder.JSONDecodeError:
        # Undecodable body is treated like an empty one
        return list()
def get_response(domain: str, path: str, headers: dict, timeout: list) -> requests.models.Response:
    """Send a GET request to ``https://<domain><path>`` and return the response.

    A connection error is recorded for the domain through
    update_last_error() and then re-raised.

    Raises:
        ValueError: if ``domain`` or ``path`` is not a non-empty str.
        requests.exceptions.ConnectionError: if the request fails to connect.
    """
    if type(domain) is not str:
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if type(path) is not str:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
    if path == "":
        raise ValueError("Parameter 'path' is empty")

    target = f"https://{domain}{path}"

    try:
        reply = reqto.get(target, headers=headers, timeout=timeout)
    except requests.exceptions.ConnectionError as e:
        # Remember the failure for this domain before handing it upwards
        update_last_error(domain, e)
        raise e

    return reply
def has_key(keys: list, search: str, value: object) -> bool:
    """Check whether any dict in *keys* maps *search* to *value*.

    Parameters:
        keys:   List of dicts to look through.
        search: Key that every dict in the list must contain.
        value:  Value to compare each dict's ``search`` entry against.

    Returns:
        True at the first dict whose ``search`` entry equals ``value``,
        otherwise False (also for an empty list).

    Raises:
        ValueError: if ``keys`` is not a list, ``search`` is not a non-empty
                    str, or an inspected element is not a dict.
        KeyError:   if an inspected dict lacks the ``search`` key.
    """
    if not isinstance(keys, list):
        raise ValueError(f"Parameter keys[]='{type(keys)}' is not 'list'")
    elif not isinstance(search, str):
        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
    elif search == "":
        raise ValueError("Parameter 'search' is empty")

    for key in keys:
        if not isinstance(key, dict):
            raise ValueError(f"key[]='{type(key)}' is not 'dict'")
        elif search not in key:
            raise KeyError(f"Cannot find search='{search}'")
        elif key[search] == value:
            # Elements after the first match are not inspected
            return True

    return False
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    Every row with at least one <td> is read: the first cell is taken as the
    domain, the second as the reason. Blacklisted and invalid domains are
    skipped with a warning.

    Parameters:
        tag: Element containing the <tr> rows to read.

    Returns:
        List of ``{"domain": ..., "reason": ...}`` dicts.

    Raises:
        ValueError: if ``tag`` is not a bs4.element.Tag.
        KeyError:   if ``tag`` contains no <tr> rows.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Select the rows only once, they are needed for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # Header or otherwise empty row
            continue

        domain = tidyup_domain(cells[0].text)
        reason = tidyup_reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Known row that lists several domains in a single cell
            for gab_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def get_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
    """Fetch *url* by splitting it and delegating to get_response().

    Only the host name, path and query string of the URL are handed on.

    Raises:
        ValueError: if ``url`` is not a non-empty str.
    """
    if type(url) is not str:
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    if url == "":
        raise ValueError("Parameter 'url' is empty")

    parts = urlparse(url)

    # Append the query only when there is one, this avoids a trailing "?"
    target = parts.path if parts.query == "" else f"{parts.path}?{parts.query}"

    return get_response(parts.hostname, target, headers, timeout)