1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
28 from urllib.parse import urlparse
30 from fba import blacklist
32 from fba import config
33 from fba import instances
35 from fba.federation import lemmy
36 from fba.federation import misskey
37 from fba.federation import peertube
# Pending errors that still need to be written to the database, keyed by domain
pending_errors = {}

# "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]

# HTTP headers for non-API requests
headers = {
    "User-Agent": config.get("useragent"),
}

# HTTP headers for API requests
api_headers = {
    "User-Agent": config.get("useragent"),
    "Content-Type": "application/json",
}

# URL for fetching peers
get_peers_url = "/api/v1/instance/peers"

# Database connection and cursor shared by this module
connection = sqlite3.connect("blocks.db")
cursor = connection.cursor()

# Pattern instances for version numbers, tried in this order by remove_version().
# Raw strings keep the backslashes literal without relying on invalid escape sequences.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev]
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0
    re.compile(r"^[a-f0-9]{7}$"),
]
85 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Return True when *var* is None or exactly an int, str, float or bool.

    The exact type is compared, so instances of subclasses of these types
    are not reported as primitive.
    """
    # DEBUG: print(f"DEBUG: var[]='{type(var)}' - CALLED!")
    # Identity check: "== None" would call a custom __eq__() and could misreport
    return var is None or type(var) in {int, str, float, bool}
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register *domain* if needed, fetch its peers and register every usable peer.

    Parameters:
        domain:   Domain whose peers are fetched (non-empty string).
        origin:   Domain that referred to *domain*, or None.
        software: Software name handed to get_peers().
        script:   Name of the invoking script (non-empty string).
        path:     Optional path handed to instances.add() for *domain*.

    Raises:
        ValueError: when domain/origin/script have a wrong type or are empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    if not instances.is_registered(domain):
        instances.add(domain, origin, script, path)

    peerlist = get_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush nodeinfo data collected while fetching the peers
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup_domain() cannot parse them
            continue

        instance = tidyup_domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        try:
            if not instances.is_registered(instance):
                instances.add(instance, domain, script)
        except Exception as e:
            # One bad peer must not abort the whole run; Ctrl-C still propagates
            print(f"ERROR: instance='{instance}',exception[{type(e)}]:'{str(e)}'")
            continue

    # DEBUG: print("DEBUG: EXIT!")
def add_peers(rows: dict) -> list:
    """Collect peers from the "linked", "allowed" and "blocked" lists of *rows*.

    Every peer is normalised with tidyup_domain(); blacklisted, empty and
    non-string entries are left out.

    Returns:
        List of tidied peer names, in the order they were found.
    """
    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    peers = []
    if not isinstance(rows, dict):
        # Nothing usable was handed over
        return peers

    for key in ["linked", "allowed", "blocked"]:
        entries = rows.get(key)
        if entries is None:
            continue

        for peer in entries:
            if not isinstance(peer, str):
                # tidyup_domain() only accepts strings
                continue

            peer = tidyup_domain(peer)
            if peer == "" or blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers
def remove_version(software: str) -> str:
    """Strip a trailing version number from a software string.

    Anything after the first ";", "," or " - " is dropped first. The last
    token (separated by space, slash or dash) is then checked against the
    module-level 'patterns'; only when it matches is it removed.

    Returns:
        The software name without version, or *software* untouched when no
        version number could be identified.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    # Drop trailing remarks first
    temp = software
    for remark in (";", ",", " - "):
        if remark in software:
            temp = software.split(remark)[0]
            break

    # The split may leave white-space behind which would hide the version token
    temp = temp.strip()

    # Look for the separator in the string that is actually being split
    version = None
    for separator in (" ", "/", "-"):
        if separator in temp:
            version = temp.split(separator)[-1]
            break

    if version is None:
        # DEBUG: print(f"DEBUG: Was not able to find common separator, returning untouched software='{software}'")
        return software

    if not any(pattern.match(version) for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    # Cut off the version together with its one-character separator
    end = len(temp) - len(version) - 1
    software = temp[0:end].strip()
    if " version" in software:
        software = strip_until(software, " version")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_powered_by(software: str) -> str:
    """Return the part of *software* that follows "powered by".

    Anything after a following " - " is dropped as well. When the phrase
    is not contained, a warning is printed and *software* is returned
    untouched.

    Raises:
        ValueError: when *software* is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    marker = "powered by"
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")

    # Search for exactly the phrase that is tested, a missing trailing space
    # would otherwise yield -1 and slice at a bogus position
    start = software.find(marker)
    if start < 0:
        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
        return software

    software = software[start + len(marker):].strip()

    if " - " in software:
        software = strip_until(software, " - ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_hosted_on(software: str) -> str:
    """Return the part of *software* that precedes "hosted on".

    Anything after a " - " in the remaining text is dropped as well. When
    the phrase is not contained, a warning is printed and *software* is
    returned untouched.

    Raises:
        ValueError: when *software* is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}' - CALLED!")
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")

    end = software.find("hosted on")
    if end < 0:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    # Keep everything in front of the phrase
    software = software[0:end].strip()

    if " - " in software:
        software = strip_until(software, " - ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def strip_until(software: str, until: str) -> str:
    """Cut *software* off at the first occurrence of *until*.

    When *until* is not contained, a warning is printed and *software* is
    returned untouched. A match right at the start (position 0) also leaves
    the string untouched.

    Raises:
        ValueError: when *software* or *until* is empty.
    """
    # DEBUG: print(f"DEBUG: software='{software}',until='{until}' - CALLED!")
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif until == "":
        print("ERROR: Bad method call, 'until' is empty")
        raise ValueError("Parameter 'until' is empty")

    end = software.find(until)
    if end < 0:
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    # Next, strip until part
    if end > 0:
        software = software[0:end].strip()

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def remove_pending_error(domain: str):
    """Drop a pending error for *domain* from 'pending_errors', if there is one.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Prevent updating any pending errors, nodeinfo was found.
    # pop() with a default ignores unknown domains without hiding other errors.
    pending_errors.pop(domain, None)

    # DEBUG: print("DEBUG: EXIT!")
def get_hash(domain: str) -> str:
    """Return the hexadecimal SHA-256 digest of the UTF-8 encoded *domain*.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
def log_error(domain: str, response: "requests.models.Response"):
    """Write an error for *domain* to table 'error_log' and purge old records.

    Parameters:
        domain:   Domain the error belongs to (non-empty string).
        response: A response object (its status_code and reason are stored),
                  an exception or a plain string (both stored with code 999).

    Records older than config value 'error_log_cleanup' (seconds) are removed
    afterwards. A failing query terminates the program with exit code 255.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    # DEBUG: print("DEBUG: domain,response[]:", domain, type(response))
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        # Exceptions (this covers JSON decoding errors, too) are stored as their message
        if isinstance(response, BaseException):
            response = str(response)

        now = time.time()
        if isinstance(response, str):
            cursor.execute(
                "INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",
                [domain, response, now]
            )
        else:
            cursor.execute(
                "INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",
                [domain, response.status_code, response.reason, now]
            )

        # Cleanup old entries
        cursor.execute("DELETE FROM error_log WHERE created < ?", [now - config.get("error_log_cleanup")])
    except Exception as e:
        print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
        sys.exit(255)

    # DEBUG: print("DEBUG: EXIT!")
def get_peers(domain: str, software: str) -> list:
    """Fetch the list of peers of *domain*.

    misskey, lemmy and peertube are delegated to their own modules. All
    other software is asked at 'get_peers_url' first and at "/api/v3/site"
    (key "federated_instances") when that fails. The number of peers and the
    time of this fetch are recorded for the domain.

    Returns:
        List of peers, empty when nothing could be fetched.

    Raises:
        ValueError: when *domain* or *software* has a wrong type or *domain* is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        return misskey.get_peers(domain)
    elif software == "lemmy":
        return lemmy.get_peers(domain)
    elif software == "peertube":
        return peertube.get_peers(domain)

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))
    peers = []
    try:
        response = get_response(domain, get_peers_url, api_headers, timeout)
        data = json_from_response(response)

        if not response.ok or response.status_code >= 400:
            # DEBUG: print(f"DEBUG: Was not able to fetch '{get_peers_url}', trying alternative ...")
            response = get_response(domain, "/api/v3/site", api_headers, timeout)
            data = json_from_response(response)

            if not response.ok or response.status_code >= 400:
                print("WARNING: Could not reach any JSON API:", domain)
                instances.update_last_error(domain, response)
            elif isinstance(data, dict) and "federated_instances" in data:
                if isinstance(data["federated_instances"], dict):
                    peers = peers + add_peers(data["federated_instances"])
            else:
                # Lists and anything else cannot carry the 'federated_instances' key
                print("WARNING: JSON response does not contain 'federated_instances':", domain)
                instances.update_last_error(domain, response)
        elif isinstance(data, list):
            # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
            peers = data
        else:
            # Only a list is a usable peer list, e.g. an error object is not
            print(f"WARNING: domain='{domain}' returned data[]='{type(data)}' instead of a list of peers")
            instances.update_last_error(domain, response)

    except Exception as e:
        print("WARNING: Some error during get():", domain, e)
        instances.update_last_error(domain, e)

    instances.set("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
def post_json_api(domain: str, path: str, parameter: str, extra_headers: dict = None) -> dict:
    """POST *parameter* to https://{domain}{path} and return the decoded JSON reply.

    Parameters:
        domain:        Domain to contact (non-empty string).
        path:          Path of the API end point (non-empty string).
        parameter:     Request body, already encoded as a string.
        extra_headers: Optional headers merged over 'api_headers'.

    Returns:
        Whatever json_from_response() yields for the reply, or an empty dict
        when the request raised an exception.

    Raises:
        ValueError: when a parameter has a wrong type or domain/path is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',parameter='{parameter}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' cannot be empty")
    elif not isinstance(parameter, str):
        raise ValueError(f"parameter[]={type(parameter)} is not 'str'")

    # A fresh dict per call instead of a shared mutable default
    if extra_headers is None:
        extra_headers = {}

    data = {}
    try:
        response = reqto.post(
            f"https://{domain}{path}",
            data=parameter,
            headers={**api_headers, **extra_headers},
            timeout=(config.get("connection_timeout"), config.get("read_timeout"))
        )

        data = json_from_response(response)
        if not response.ok or response.status_code >= 400:
            print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',parameter()={len(parameter)},response.status_code='{response.status_code}',data[]='{type(data)}'")
            instances.update_last_error(domain, response)

    except Exception as e:
        print(f"WARNING: Some error during post(): domain='{domain}',path='{path}',parameter()={len(parameter)},exception[{type(e)}]:'{str(e)}'")

    # DEBUG: print(f"DEBUG: Returning data({len(data)})=[]:{type(data)}")
    return data
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch the nodeinfo document of *domain*.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. When it
    yields nothing, a list of well-known static paths is requested in turn.
    With *path* given, only the static path equal to it (relative, or as
    absolute https URL on *domain*) is requested.

    Returns:
        The data of the last request that was made; an empty dict when no
        request was made or decoding yielded nothing. This may be an error
        document, callers have to inspect it.

    Raises:
        ValueError: when *domain* or *path* has a wrong type or *domain* is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)
    if len(nodeinfo) > 0:
        return nodeinfo

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]
    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    data = {}
    for request in request_paths:
        # Compare the wanted path with the candidate, not with itself
        if path is not None and path != "" and path not in (request, f"https://{domain}{request}"):
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        try:
            response = get_response(domain, request, api_headers, timeout)
            data = json_from_response(response)

            if response.ok and isinstance(data, dict):
                instances.set("detection_mode", domain, "STATIC_CHECK")
                instances.set("nodeinfo_url", domain, request)
                break
            elif response.ok and isinstance(data, list):
                # A list cannot be nodeinfo, try the next candidate
                print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
                continue
            elif not response.ok or response.status_code >= 400:
                print("WARNING: Failed fetching nodeinfo from domain:", domain)
                instances.update_last_error(domain, response)
                continue

        except Exception as e:
            # DEBUG: print("DEBUG: Cannot fetch API request:", request)
            instances.update_last_error(domain, e)
            continue

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch the nodeinfo document through /.well-known/nodeinfo.

    The first link whose "rel" is listed in 'nodeinfo_identifier' and whose
    target delivers a JSON object wins; detection mode and nodeinfo URL are
    then recorded for the domain.

    Returns:
        The nodeinfo document, or an empty dict when none could be fetched.
        The /.well-known index itself is never returned.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Only set once a real nodeinfo document has been fetched
    found = {}

    try:
        response = get_response(domain, "/.well-known/nodeinfo", api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))
        index = json_from_response(response)

        if response.ok and isinstance(index, dict):
            if "links" in index:
                for link in index["links"]:
                    if link["rel"] not in nodeinfo_identifier:
                        print("WARNING: Unknown 'rel' value:", domain, link["rel"])
                        continue

                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
                    response = get_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))
                    data = json_from_response(response)

                    if response.ok and isinstance(data, dict):
                        instances.set("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set("nodeinfo_url", domain, link["href"])
                        found = data
                        break
            else:
                print("WARNING: nodeinfo does not contain 'links':", domain)

    except Exception as e:
        print("WARNING: Failed fetching .well-known info:", domain)
        instances.update_last_error(domain, e)

    # DEBUG: print("DEBUG: Returning found[]:", type(found))
    return found
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of *domain* from the HTML page at *path*.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fall-back. Version numbers and phrases such as "powered by",
    "hosted on", " by " and " see " are stripped from the found value.

    Returns:
        The software name, or None when nothing was found.

    Raises:
        ValueError: when *domain* or *path* has a wrong type or is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    try:
        response = get_response(domain, path, headers, (config.get("connection_timeout"), config.get("read_timeout")))

        if response.ok and response.status_code < 300 and len(response.text) > 0:
            doc = bs4.BeautifulSoup(response.text, "html.parser")

            generator = doc.find("meta", {"name": "generator"})
            site_name = doc.find("meta", {"property": "og:site_name"})

            # A tag without "content" attribute is useless, try the next one then
            if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
                software = tidyup_domain(generator.get("content"))
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set("detection_mode", domain, "GENERATOR")
                remove_pending_error(domain)
            elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
                software = tidyup_domain(site_name.get("content"))
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set("detection_mode", domain, "SITE_NAME")
                remove_pending_error(domain)

    except Exception as e:
        # DEBUG: print(f"DEBUG: Cannot fetch / from '{domain}':", e)
        instances.update_last_error(domain, e)

    if isinstance(software, str) and software == "":
        # An empty string means "not detected"
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        software = remove_version(software)

    if isinstance(software, str) and " powered by " in software:
        software = remove_version(strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = remove_version(strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software *domain* is running.

    The nodeinfo document ([software][name]) is the primary source. When it
    is missing, reports an error or lacks the name, the generator of the
    front page is used instead. Known forks are mapped to their upstream
    name and version numbers are stripped.

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: when *domain* or *path* has a wrong type or *domain* is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if not isinstance(data, dict) or len(data) == 0:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif not isinstance(data.get("software"), dict) or not isinstance(data["software"].get("name"), str):
        # No usable [software][name], tidyup_domain() accepts only strings
        return fetch_generator_from_path(domain)

    software = tidyup_domain(data["software"]["name"])

    # Map forks to their upstream and strip decorations
    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup_domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        software = strip_powered_by(software)
    elif " by " in software:
        software = strip_until(software, " by ")
    elif " see " in software:
        software = strip_until(software, " see ")

    if software == "":
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        return None

    if "." in software or " " in software:
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = remove_version(strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
def _build_bot_message(instance: str, blocklist: list) -> str:
    """Compose the status text announcing the first 20 blocks of *instance*."""
    message = instance + " has blocked the following instances:\n\n"
    truncated = len(blocklist) > 20

    for block in blocklist[:20]:
        reason = block["reason"]
        if reason is None or reason == "":
            message = message + block["blocked"] + " with unspecified reason\n"
            continue

        if len(reason) > 420:
            reason = reason[0:419] + "[…]"

        # A zero-width space after "@" keeps the post from mentioning accounts
        message = message + block["blocked"] + ' for "' + reason.replace("@", "@\u200b") + '"\n'

    if truncated:
        message = message + "(the list has been truncated to the first 20 entries)"

    return message


def send_bot_post(instance: str, blocklist: dict):
    """Post a status listing the blocks of *instance* through the configured bot account.

    Parameters:
        instance:  Domain that issued the blocks (non-empty string).
        blocklist: List (or tuple) of dicts with keys "blocked" and "reason";
                   only the first 20 entries are posted.

    Returns:
        True after the request was sent.

    Raises:
        ValueError: when *instance* is not a non-empty string or *blocklist*
                    is not a list/tuple.
    """
    # DEBUG: print(f"DEBUG: instance={instance},blocklist()={len(blocklist)} - CALLED!")
    if not isinstance(instance, str):
        raise ValueError(f"Parameter instance[]='{type(instance)}' is not 'str'")
    elif instance == "":
        raise ValueError("Parameter 'instance' is empty")
    elif not isinstance(blocklist, (list, tuple)):
        # The entries are sliced and iterated below, a dict cannot provide that
        raise ValueError(f"Parameter blocklist[]='{type(blocklist)}' is not 'list'")

    message = _build_bot_message(instance, list(blocklist))

    botheaders = {**api_headers, **{"Authorization": "Bearer " + config.get("bot_token")}}

    reqto.post(
        f"{config.get('bot_instance')}/api/v1/statuses",
        data={
            "status"      : message,
            "visibility"  : config.get('bot_visibility'),
            "content_type": "text/plain"
        },
        headers=botheaders,
        timeout=10
    )

    return True
def fetch_friendica_blocks(domain: str) -> dict:
    """Scrape the block list from the /friendica page of *domain*.

    The table inside the element with id "about_blocklist" is read; the
    first cell of a row is taken as blocked domain, the second as reason.

    Returns:
        {"reject": [{"domain": ..., "reason": ...}, ...]}, or an empty dict
        when the page could not be fetched or holds no block list.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        doc = bs4.BeautifulSoup(
            get_response(domain, "/friendica", headers, (config.get("connection_timeout"), config.get("read_timeout"))).text,
            "html.parser",
        )
    except Exception as e:
        print("WARNING: Failed to fetch /friendica from domain:", domain, e)
        instances.update_last_error(domain, e)
        return {}

    blocklist = doc.find(id="about_blocklist")

    # Prevents exceptions:
    if blocklist is None:
        # DEBUG: print("DEBUG: Instance has no block list:", domain)
        return {}

    table = blocklist.find("table")
    if table is None:
        # Block list section without a table, nothing to read
        return {}

    body = table.find("tbody")
    rows = body.find_all("tr") if body else table.find_all("tr")

    blocked = []
    for line in rows:
        cells = line.find_all("td")
        if len(cells) < 2:
            # Header rows (<th>) and malformed rows carry no block
            continue

        blocked.append({
            "domain": tidyup_domain(cells[0].text),
            "reason": tidyup_reason(cells[1].text)
        })

    # DEBUG: print("DEBUG: Returning blocked() for domain:", domain, len(blocked))
    return {
        "reject": blocked
    }
def _fetch_misskey_instances(domain: str, parameter: str, field: str) -> list:
    """Page through /api/federation/instances with *parameter* set and collect hosts having *field* set."""
    step = config.get("misskey_limit")
    offset = 0
    seen = set()
    found = []

    # The API is queried page-by-page with 'misskey_limit' rows per request
    while True:
        # DEBUG: print(f"DEBUG: Fetching offset='{offset}' from '{domain}' ...")
        query = {
            "sort" : "+pubAt",
            "host" : None,
            parameter: True,
            "limit": step
        }
        if offset > 0:
            query["offset"] = offset

        try:
            fetched = post_json_api(domain, "/api/federation/instances", json.dumps(query), {
                "Origin": domain
            })

            # An error object (dict) or an empty reply ends the paging
            if not isinstance(fetched, list) or len(fetched) == 0:
                break

            count = 0
            for instance in fetched:
                # just in case
                if not isinstance(instance, dict) or not instance.get(field):
                    continue

                host = tidyup_domain(instance["host"])
                if host in seen:
                    continue

                seen.add(host)
                count = count + 1
                found.append({
                    "domain": host,
                    # no reason field, nothing
                    "reason": None
                })

            # Skip exactly the rows that have been received so far
            offset = offset + len(fetched)

            if count == 0 or len(fetched) < step:
                # DEBUG: print("DEBUG: API is no more returning new instances, aborting loop!")
                break

        except Exception as e:
            print("WARNING: Caught error, exiting loop:", domain, e)
            instances.update_last_error(domain, e)
            break

    return found


def fetch_misskey_blocks(domain: str) -> dict:
    """Fetch suspended and blocked instances of the misskey node *domain*.

    Both lists are requested from /api/federation/instances. The time of this
    fetch is recorded for the domain afterwards.

    Returns:
        {"reject": blocked instances, "followers_only": suspended instances},
        each a list of {"domain": ..., "reason": None}.

    Raises:
        ValueError: when *domain* is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # "suspended" is follow-only in misskey's terminology, "blocked" is a full suspend
    suspended = _fetch_misskey_instances(domain, "suspended", "isSuspended")
    blocked = _fetch_misskey_instances(domain, "blocked", "isBlocked")

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    return {
        "reject"        : blocked,
        "followers_only": suspended
    }
def tidyup_reason(reason: str) -> str:
    """Normalize a block reason string.

    Surrounding whitespace is stripped and every "â" character is replaced
    by a plain double quote (presumably residue of a mis-decoded
    typographic quote - TODO confirm).

    Raises:
        ValueError: if ``reason`` is not a string.
    """
    # DEBUG: print(f"DEBUG: reason='{reason}' - CALLED!")
    if not isinstance(reason, str):
        raise ValueError(f"Parameter reason[]={type(reason)} is not expected")

    # A literal single-character replacement needs no regular expression
    reason = reason.strip().replace("â", "\"")

    # DEBUG: print(f"DEBUG: reason='{reason}' - EXIT!")
    return reason
def tidyup_domain(domain: str) -> str:
    """Reduce a raw host/URL/handle string to a bare domain name.

    The value is lower-cased and trimmed, then a trailing dot, a trailing
    port number, a leading "http:"/"https:" scheme (with or without
    slashes), a trailing slash, a leading "@" and any "user@" prefix are
    removed. Finally everything from "/profile/" or "/users/" onwards is
    cut off, because individual accounts do not belong in block lists.

    Raises:
        ValueError: if ``domain`` is not a string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not expected")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No port number
    domain = re.sub(r":\d+$", "", domain)

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:(/*)", "", domain)

    # No trailing slash
    domain = re.sub(r"/$", "", domain)

    # No leading @ sign
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r"(.+)@", "", domain)

    # str.find() returns -1 (truthy) when nothing is found, so test
    # membership instead of the returned index
    if "/profile/" in domain:
        domain = domain.split("/profile/")[0]
    elif "/users/" in domain:
        domain = domain.split("/users/")[0]

    # DEBUG: print(f"DEBUG: domain='{domain}' - EXIT!")
    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of a response, tolerating blank or invalid bodies.

    Returns:
        Whatever ``response.json()`` yields, or an empty list when the body
        is blank or cannot be decoded as JSON.

    Raises:
        ValueError: if ``response`` is not a ``requests`` Response object.
    """
    # DEBUG: print(f"DEBUG: response[]={type(response)} - CALLED!")
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    data = []
    if response.text.strip() != "":
        # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...")
        try:
            data = response.json()
        except ValueError:
            # Deliberately best-effort: an undecodable body counts as "no
            # data". ValueError is the common base of the decode errors
            # from both the json and simplejson backends of requests.
            pass

    # DEBUG: print(f"DEBUG: data[]={type(data)} - EXIT!")
    return data
def get_response(domain: str, path: str, headers: dict, timeout: list) -> requests.models.Response:
    """Send a GET request to ``https://{domain}{path}`` and return the response.

    Args:
        domain: Host name to contact, must be a non-empty string.
        path: Request path appended verbatim to the host, must be non-empty.
        headers: HTTP headers handed through to ``reqto.get()``.
        timeout: Timeout value handed through to ``reqto.get()``.

    Raises:
        ValueError: if ``domain`` or ``path`` is not a string or is empty.
        requests.exceptions.ConnectionError: when the connection fails; the
            error is recorded through ``instances.update_last_error()`` first.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',headers()={len(headers)},timeout={timeout} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    try:
        # DEBUG: print(f"DEBUG: Sending request to '{domain}{path}' ...")
        response = reqto.get(
            f"https://{domain}{path}",
            headers=headers,
            timeout=timeout
        )
    except requests.exceptions.ConnectionError as e:
        # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(e)}]='{str(e)}'")
        instances.update_last_error(domain, e)
        # NOTE(review): the error is assumed to propagate to the caller
        # after being recorded - confirm. Bare raise keeps the traceback.
        raise

    # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!")
    return response
def has_key(keys: list, search: str, value: object) -> bool:
    """Tell whether any dict in ``keys`` maps ``search`` to ``value``.

    The list is scanned in order and the scan stops at the first match, so
    entries after the match are not validated.

    Args:
        keys: List of dicts to scan.
        search: Non-empty dict key that every scanned entry must contain.
        value: Value compared (with ``==``) against ``entry[search]``.

    Returns:
        True when a matching entry was found, False otherwise (including
        for an empty list).

    Raises:
        ValueError: if ``keys`` is not a list, ``search`` is not a string or
            is empty, or a scanned entry is not a dict.
        KeyError: if a scanned entry lacks the key ``search``.
    """
    # DEBUG: print(f"DEBUG: keys()={len(keys)},search='{search}',value[]='{type(value)}' - CALLED!")
    if not isinstance(keys, list):
        raise ValueError(f"Parameter keys[]='{type(keys)}' is not 'list'")
    elif not isinstance(search, str):
        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
    elif search == "":
        raise ValueError("Parameter 'search' is empty")

    has = False
    # DEBUG: print(f"DEBUG: Checking keys()={len(keys)} ...")
    for key in keys:
        # DEBUG: print(f"DEBUG: key['{type(key)}']={key}")
        if not isinstance(key, dict):
            raise ValueError(f"key[]='{type(key)}' is not 'dict'")
        elif search not in key:
            raise KeyError(f"Cannot find search='{search}'")
        elif key[search] == value:
            has = True
            break

    # DEBUG: print(f"DEBUG: has={has} - EXIT!")
    return has
def find_domains(tag: bs4.element.Tag) -> list:
    """Collect blocked domains and their reasons from the rows of a table.

    Every <tr> below ``tag`` that contains at least one <td> is one entry:
    the first cell is taken as the domain, the second cell as the reason.
    Blacklisted domains and values that fail ``validators.domain()`` are
    skipped with a warning.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: if ``tag`` is not a ``bs4.element.Tag``.
        KeyError: if ``tag`` contains no <tr> elements.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Select the rows once instead of once for the check and once for the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = []
    for element in rows:
        # DEBUG: print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup_domain(cells[0].text)

        # A row may consist of the domain cell only, the reason is empty then
        reason = tidyup_reason(cells[1].text) if len(cells) > 1 else ""

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue

        if domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print(f"DEBUG: Multiple domains detected in one row")
            # NOTE(review): "gab.ai" is derived from the "/.ai" part of the
            # combined cell text - confirm the intended list of domains.
            for single in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": single,
                    "reason": reason,
                })
            continue

        if not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def get_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
    """Fetch a complete URL by splitting it and delegating to get_response().

    Only the host name, path and query string of ``url`` are used; the
    request itself is built by ``get_response()``.

    Args:
        url: Non-empty URL string to fetch.
        headers: HTTP headers handed through to ``get_response()``.
        timeout: Timeout value handed through to ``get_response()``.

    Raises:
        ValueError: if ``url`` is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
    if not isinstance(url, str):
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    elif url == "":
        raise ValueError("Parameter 'url' is empty")

    # DEBUG: print(f"DEBUG: Parsing url='{url}'")
    components = urlparse(url)

    # Append the query string only when there is one, avoid trailing ?
    # DEBUG: print(f"DEBUG: components[{type(components)}]={components}")
    path = components.path
    if components.query != "":
        path = f"{path}?{components.query}"

    response = get_response(components.hostname, path, headers, timeout)

    # DEBUG: print(f"DEBUG: response[]='{type(response)}' - EXXIT!")
    return response