1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
28 from urllib.parse import urlparse
30 from fba import blacklist
32 from fba import config
33 from fba import instances
35 from fba.federation import lemmy
36 from fba.federation import misskey
37 from fba.federation import peertube
39 # Array with pending errors needed to be written to database
# "rel" identifiers that fetch_wellknown_nodeinfo() accepts in the "links"
# section of a /.well-known/nodeinfo document. They are schema names used for
# matching only (no real URLs); newest schema first, https before http.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
# HTTP headers for non-API requests (plain HTML pages)
headers = {
    "User-Agent": config.get("useragent"),
}

# HTTP headers for API requests (JSON endpoints)
api_headers = {
    "User-Agent": config.get("useragent"),
    "Content-Type": "application/json",
}

# Module-wide database handle; log_error() writes through `cursor`
connection = sqlite3.connect("blocks.db")
cursor = connection.cursor()
# Pattern instances for version numbers, remove_version() tries them in order.
# Raw strings are used so that "\." and "\d" reach the regex engine unchanged
# without relying on Python's deprecated handling of unknown string escapes.
patterns = [
    # semantic version number (with optional v|V prefix)
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)?$"),
    # non-semantic, e.g. 1.2.3.4
    re.compile(r"^(?P<version>v|V{0,1})(\.{0,1})(?P<major>0|[1-9]\d*)\.(?P<minor>0+|[1-9]\d*)(\.(?P<patch>0+|[1-9]\d*)(\.(?P<subpatch>0|[1-9]\d*))?)$"),
    # non-semantic, e.g. 2023.05[-dev]
    re.compile(r"^(?P<year>[1-9]{1}[0-9]{3})\.(?P<month>[0-9]{2})(-dev){0,1}$"),
    # non-semantic, e.g. abcdef0 (abbreviated commit hash)
    re.compile(r"^[a-f0-9]{7}$"),
]
82 ##### Other functions #####
def is_primitive(var: object) -> bool:
    """Return True when *var* is None or exactly an int, str, float or bool.

    The exact type is compared, so instances of subclasses are not treated as
    primitive. None is detected by identity, which keeps objects with a
    permissive ``__eq__`` from being mistaken for None.
    """
    return var is None or type(var) in {int, str, float, bool}
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register *domain* if needed and register every peer it reports.

    The peer list comes from fetch_peers(). Each peer is normalised with
    tidyup_domain(); empty, syntactically invalid and blacklisted peers are
    skipped, the remaining unknown ones are added with *domain* as origin.

    Raises:
        ValueError: if 'domain' or 'script' is not a non-empty string, or if
            'origin' is neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        # Name the parameter that is actually empty
        raise ValueError("Parameter 'script' is empty")

    if not instances.is_registered(domain):
        instances.add(domain, origin, script, path)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data collected for this domain while fetching its peers
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" entries as tidyup_domain() cannot parse them
            continue

        instance = tidyup_domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup_domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        try:
            if not instances.is_registered(instance):
                instances.add(instance, domain, script)
        except Exception as e:
            # One failing peer must not abort the whole run. Exception (not
            # BaseException) keeps Ctrl-C and sys.exit() working.
            print(f"ERROR: instance='{instance}',exception[{type(e)}]:'{str(e)}'")
            continue
def add_peers(rows: dict) -> list:
    """Collect the peers found under 'linked', 'allowed' and 'blocked'.

    Each peer is normalised with tidyup_domain(). Blacklisted peers and None
    entries are left out. Keys that are missing from *rows* or map to None are
    ignored.

    Returns:
        A new list with the tidied peer domains, in the order encountered.
    """
    peers = []
    for key in ("linked", "allowed", "blocked"):
        entries = rows.get(key)
        if entries is None:
            continue

        for peer in entries:
            if peer is None:
                # tidyup_domain() raises on non-strings, which would discard
                # every peer collected so far
                continue

            peer = tidyup_domain(peer)
            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers
def remove_version(software: str) -> str:
    """Strip a trailing version number from a software name.

    Anything after the first ';', ',' or ' - ' is dropped first. The last
    token, separated by ' ', '/' or '-', is treated as the version candidate
    and checked against the module-level `patterns`. When it matches, the
    name in front of it is returned; a trailing ' version' word is removed as
    well. In every other case *software* is returned untouched.
    """
    if "." not in software and " " not in software:
        print(f"WARNING: software='{software}' does not contain a version number.")
        return software

    # Drop trailing remarks such as "; see ..." or " - powered by ..."
    temp = software
    for delimiter in (";", ",", " - "):
        if delimiter in software:
            temp = software.split(delimiter)[0].strip()
            break

    # Look for the separator in the truncated string that is actually split.
    # Testing the full string picks the wrong separator when only the dropped
    # remark contains a space.
    for separator in (" ", "/", "-"):
        if separator in temp:
            name, _, version = temp.rpartition(separator)
            break
    else:
        # No common separator found, nothing to strip
        return software

    if not any(pattern.match(version) for pattern in patterns):
        print(f"WARNING: version='{version}' does not match regex, leaving software='{software}' untouched.")
        return software

    software = name.strip()
    if " version" in software:
        software = strip_until(software, " version")

    return software
def strip_powered_by(software: str) -> str:
    """Return what follows "powered by" in *software*, cut at the first " - ".

    When "powered by" does not occur, a warning is printed and *software* is
    returned unchanged. The same marker is used for the check and for the
    cut, so a string that merely ends in "powered by" yields an empty string
    instead of a slice at a bogus offset.

    Raises:
        ValueError: if 'software' is empty.
    """
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")

    _, found, remainder = software.partition("powered by")
    if not found:
        print(f"WARNING: Cannot find 'powered by' in '{software}'!")
        return software

    software = remainder.strip()
    if " - " in software:
        # Drop a trailing remark such as " - version x"
        software = software.partition(" - ")[0].strip()

    return software
def strip_hosted_on(software: str) -> str:
    """Return what precedes "hosted on" in *software*, cut at the first " - ".

    When "hosted on" does not occur, a warning is printed and *software* is
    returned unchanged.

    Raises:
        ValueError: if 'software' is empty.
    """
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")

    head, found, _ = software.partition("hosted on")
    if not found:
        print(f"WARNING: Cannot find 'hosted on' in '{software}'!")
        return software

    # Keep the part in front of the marker (the previous code indexed the
    # string with a tuple and an undefined name and could never succeed)
    software = head.strip()
    if " - " in software:
        software = software.partition(" - ")[0].strip()

    return software
def strip_until(software: str, until: str) -> str:
    """Return the part of *software* before the first *until*, stripped.

    When *until* does not occur, a warning is printed and *software* is
    returned unchanged.

    Raises:
        ValueError: if 'software' or 'until' is empty. ValueError is a
            subclass of Exception, so existing `except Exception` handlers
            keep working.
    """
    if software == "":
        print("ERROR: Bad method call, 'software' is empty")
        raise ValueError("Parameter 'software' is empty")
    elif until == "":
        print("ERROR: Bad method call, 'until' is empty")
        raise ValueError("Parameter 'until' is empty")

    head, found, _ = software.partition(until)
    if not found:
        print(f"WARNING: Cannot find '{until}' in '{software}'!")
        return software

    return head.strip()
def remove_pending_error(domain: str):
    """Discard a pending error recorded for *domain*, if there is one.

    Raises:
        ValueError: if 'domain' is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Prevent updating any pending errors, nodeinfo was found. pop() with a
    # default tolerates domains that have no pending error.
    pending_errors.pop(domain, None)
def get_hash(domain: str) -> str:
    """Return the hexadecimal SHA-256 digest of the UTF-8 encoded *domain*.

    Raises:
        ValueError: if 'domain' is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
# Writes one row for `domain` into the `error_log` table and afterwards
# deletes rows whose `created` value is older than the configured
# `error_log_cleanup` distance.
# `response` may also be an exception instance; it is converted with str()
# before it is stored.
# NOTE(review): the annotation only names requests.models.Response although
# exceptions and plain strings are handled as well.
307 def log_error(domain: str, response: requests.models.Response):
308 # DEBUG: print("DEBUG: domain,response[]:", domain, type(response))
309 if type(domain) != str:
310 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
312 raise ValueError(f"Parameter 'domain' is empty")
315 # DEBUG: print("DEBUG: BEFORE response[]:", type(response))
# NOTE(review): the second isinstance() is redundant, JSONDecodeError already
# derives from BaseException.
316 if isinstance(response, BaseException) or isinstance(response, json.decoder.JSONDecodeError):
317 response = str(response)
319 # DEBUG: print("DEBUG: AFTER response[]:", type(response))
# String responses are stored with the fixed error code 999, everything else
# with the response's own status code.
320 if type(response) is str:
321 cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",[
327 cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",[
329 response.status_code,
334 # Cleanup old entries
335 # DEBUG: print(f"DEBUG: Purging old records (distance: {config.get('error_log_cleanup')})")
336 cursor.execute("DELETE FROM error_log WHERE created < ?", [time.time() - config.get("error_log_cleanup")])
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
337 except BaseException as e:
338 print(f"ERROR: failed SQL query: domain='{domain}',exception[{type(e)}]:'{str(e)}'")
341 # DEBUG: print("DEBUG: EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the list of peers of *domain*.

    Misskey, Lemmy and PeerTube are delegated to their dedicated modules and
    their result is returned directly. For everything else
    '/api/v1/instance/peers' is queried first; when that fails '/api/v3/site'
    is tried and its 'federated_instances' entry is fed through add_peers().
    Failures are recorded with instances.update_last_error(). Afterwards the
    peer count and the last fetch time of *domain* are updated.

    Raises:
        ValueError: if 'domain' is not a non-empty string or 'software' is
            neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))
    peers = []
    try:
        response = get_response(domain, "/api/v1/instance/peers", api_headers, timeout)
        data = json_from_response(response)

        if not response.ok or response.status_code >= 400:
            # Was not able to fetch peers, trying alternative
            response = get_response(domain, "/api/v3/site", api_headers, timeout)
            data = json_from_response(response)

            if not response.ok or response.status_code >= 400:
                print("WARNING: Could not reach any JSON API:", domain)
                instances.update_last_error(domain, response)
            elif isinstance(data, list):
                peers = data
            elif "federated_instances" in data:
                peers = peers + add_peers(data["federated_instances"])
            else:
                print("WARNING: JSON response does not contain 'federated_instances':", domain)
                instances.update_last_error(domain, response)
        else:
            # Querying the first API was successful
            peers = data
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C and sys.exit() still work
        print("WARNING: Some error during get():", domain, e)
        instances.update_last_error(domain, e)

    instances.set("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    return peers
# Sends a POST request to https://{domain}{path} through reqto.post() and
# parses the reply with json_from_response().
# The request headers are `api_headers` merged with `extra_headers`; on
# duplicate keys the value from `extra_headers` wins.
# A non-OK reply prints a warning and is recorded with
# instances.update_last_error().
# NOTE(review): `extra_headers` uses a mutable default ({}). The visible code
# only reads it, but a None default would be safer.
406 def post_json_api(domain: str, path: str, parameter: str, extra_headers: dict = {}) -> dict:
407 # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',parameter='{parameter}',extra_headers()={len(extra_headers)} - CALLED!")
408 if type(domain) != str:
409 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
411 raise ValueError(f"Parameter 'domain' is empty")
412 elif type(path) != str:
413 raise ValueError(f"path[]={type(path)} is not 'str'")
415 raise ValueError("Parameter 'path' cannot be empty")
416 elif type(parameter) != str:
417 raise ValueError(f"parameter[]={type(parameter)} is not 'str'")
419 # DEBUG: print("DEBUG: Sending POST to domain,path,parameter:", domain, path, parameter, extra_headers)
422 response = reqto.post(
423 f"https://{domain}{path}",
425 headers={**api_headers, **extra_headers},
# Timeout is the (connect, read) pair taken from the configuration
426 timeout=(config.get("connection_timeout"), config.get("read_timeout"))
429 data = json_from_response(response)
430 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
431 if not response.ok or response.status_code >= 400:
432 print(f"WARNING: Cannot query JSON API: domain='{domain}',path='{path}',parameter()={len(parameter)},response.status_code='{response.status_code}',data[]='{type(data)}'")
433 instances.update_last_error(domain, response)
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
435 except BaseException as e:
436 print(f"WARNING: Some error during post(): domain='{domain}',path='{path}',parameter()={len(parameter)},exception[{type(e)}]:'{str(e)}'")
438 # DEBUG: print(f"DEBUG: Returning data({len(data)})=[]:{type(data)}")
# Tries to obtain the nodeinfo document of `domain`.
# fetch_wellknown_nodeinfo() is asked first; only when it returns nothing are
# the static paths in `request_paths` probed one after another.
# A dict reply from a static path sets detection mode "STATIC_CHECK" and
# stores the path as "nodeinfo_url". A list reply is reported as unsupported
# and a failed request is recorded with instances.update_last_error().
# NOTE(review): the return annotation says list, yet the success path checks
# for a dict - confirm against the callers.
441 def fetch_nodeinfo(domain: str, path: str = None) -> list:
442 # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
443 if type(domain) != str:
444 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
446 raise ValueError(f"Parameter 'domain' is empty")
447 elif type(path) != str and path != None:
448 raise ValueError(f"Parameter path[]={type(path)} is not 'str'")
450 # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
451 nodeinfo = fetch_wellknown_nodeinfo(domain)
453 # DEBUG: print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
454 if len(nodeinfo) > 0:
455 # DEBUG: print("DEBUG: nodeinfo()={len(nodeinfo))} - EXIT!")
459 "/nodeinfo/2.1.json",
461 "/nodeinfo/2.0.json",
468 for request in request_paths:
# NOTE(review): the f-string embeds `path` itself, so this comparison holds
# for every non-empty `path` and each request is skipped. Presumably
# f"https://{domain}{request}" was intended - confirm and fix.
469 if path != None and path != "" and path != f"https://{domain}{path}":
470 # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
474 # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
475 response = get_response(domain, request, api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))
477 data = json_from_response(response)
478 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
479 if response.ok and isinstance(data, dict):
480 # DEBUG: print("DEBUG: Success:", request)
481 instances.set("detection_mode", domain, "STATIC_CHECK")
482 instances.set("nodeinfo_url" , domain, request)
484 elif response.ok and isinstance(data, list):
485 print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
487 elif not response.ok or response.status_code >= 400:
488 print("WARNING: Failed fetching nodeinfo from domain:", domain)
489 instances.update_last_error(domain, response)
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
492 except BaseException as e:
493 # DEBUG: print("DEBUG: Cannot fetch API request:", request)
494 instances.update_last_error(domain, e)
497 # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
# Discovers the nodeinfo document of `domain` through /.well-known/nodeinfo.
# Every entry in "links" whose "rel" is listed in `nodeinfo_identifier` is
# fetched with get_url(). A dict reply sets detection mode "AUTO_DISCOVERY"
# and stores the link's href as "nodeinfo_url".
# Any exception is recorded with instances.update_last_error().
# NOTE(review): the return annotation says list, yet only dict replies are
# accepted - confirm against the callers.
500 def fetch_wellknown_nodeinfo(domain: str) -> list:
501 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
502 if type(domain) != str:
503 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
505 raise ValueError(f"Parameter 'domain' is empty")
507 # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
511 response = get_response(domain, "/.well-known/nodeinfo", api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))
513 data = json_from_response(response)
514 # DEBUG: print("DEBUG: domain,response.ok,data[]:", domain, response.ok, type(data))
515 if response.ok and isinstance(data, dict):
517 # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
518 if "links" in nodeinfo:
519 # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
520 for link in nodeinfo["links"]:
521 # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
522 if link["rel"] in nodeinfo_identifier:
523 # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
# NOTE(review): the linked document uses the generic timeouts, not the
# nodeinfo_* ones used for the /.well-known request - confirm this is wanted.
524 response = get_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))
526 data = json_from_response(response)
527 # DEBUG: print("DEBUG: href,response.ok,response.status_code:", link["href"], response.ok, response.status_code)
528 if response.ok and isinstance(data, dict):
529 # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
530 instances.set("detection_mode", domain, "AUTO_DISCOVERY")
531 instances.set("nodeinfo_url" , domain, link["href"])
534 print("WARNING: Unknown 'rel' value:", domain, link["rel"])
536 print("WARNING: nodeinfo does not contain 'links':", domain)
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
538 except BaseException as e:
539 print("WARNING: Failed fetching .well-known info:", domain)
540 instances.update_last_error(domain, e)
543 # DEBUG: print("DEBUG: Returning data[]:", type(data))
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of *domain* from the HTML page at *path*.

    The page's <meta name="generator"> tag is preferred and
    <meta property="og:site_name"> is the fallback. The matching detection
    mode is stored and the pending error of the domain is removed. The found
    text is then cleaned from version numbers and from "powered by",
    "hosted on", " by " and " see " remarks.

    Returns:
        The software name, or None when nothing usable was found.

    Raises:
        ValueError: if 'domain' or 'path' is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        # Name the parameter that is actually empty
        raise ValueError("Parameter 'path' is empty")

    software = None
    try:
        response = get_response(domain, path, headers, (config.get("connection_timeout"), config.get("read_timeout")))

        if response.ok and response.status_code < 300 and len(response.text) > 0:
            doc = bs4.BeautifulSoup(response.text, "html.parser")

            generator = doc.find("meta", {"name": "generator"})
            site_name = doc.find("meta", {"property": "og:site_name"})

            if isinstance(generator, bs4.element.Tag):
                software = tidyup_domain(generator.get("content"))
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set("detection_mode", domain, "GENERATOR")
                remove_pending_error(domain)
            elif isinstance(site_name, bs4.element.Tag):
                # Assign to `software`; a misspelt target name used to
                # discard this result
                software = tidyup_domain(site_name.get("content"))
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set("detection_mode", domain, "SITE_NAME")
                remove_pending_error(domain)
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C and sys.exit() still work
        instances.update_last_error(domain, e)

    if isinstance(software, str) and software == "":
        # Corrected empty string to None
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        software = remove_version(software)

    if isinstance(software, str):
        if " powered by " in software:
            software = remove_version(strip_powered_by(software))
        elif " hosted on " in software:
            software = remove_version(strip_hosted_on(software))
        elif " by " in software:
            software = strip_until(software, " by ")
        elif " see " in software:
            software = strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software *domain* is running.

    The nodeinfo document is consulted first. When it is missing, reports an
    error or lacks software.name, the generator meta tag of the start page is
    used instead (fetch_generator_from_path()). Known forks are mapped to
    their upstream project (e.g. akkoma -> pleroma) and version numbers and
    "powered by"/" by "/" see " remarks are removed.

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: if 'domain' is not a non-empty string or 'path' is
            neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if not isinstance(data, dict) or len(data) == 0:
        # Could not determine software type from nodeinfo
        return fetch_generator_from_path(domain)

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # JSON response does not include [software][name], fetching / instead
        return fetch_generator_from_path(domain)

    software = tidyup_domain(data["software"]["name"])

    if software in ("akkoma", "rebased"):
        software = "pleroma"
    elif software in ("hometown", "ecko"):
        software = "mastodon"
    elif software in ("calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"):
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        # tidyup_domain() - the previous misspelt call raised NameError
        software = tidyup_domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup_domain(software.split("|")[0])
    elif "powered by" in software:
        software = strip_powered_by(software)
    elif " by " in software:
        software = strip_until(software, " by ")
    elif " see " in software:
        software = strip_until(software, " see ")

    if software == "":
        # Nothing left after tidying up, try the generator meta tag instead
        print("WARNING: tidyup_domain() left no software name behind:", domain)
        software = fetch_generator_from_path(domain)
    elif "." in software or " " in software:
        # May contain a version number
        software = remove_version(software)

    # The generator fallback may return None, hence the type check
    if isinstance(software, str) and "powered by" in software:
        software = remove_version(strip_powered_by(software))

    return software
# Builds a plain-text status that lists the blocks of `instance` and posts it
# to the configured bot instance (/api/v1/statuses), authorised with the
# configured bot token.
# Each entry of `blocklist` is read through its "blocked" and "reason" keys.
# Reasons longer than 420 characters are shortened and "@" is followed by a
# zero-width space so that no account gets mentioned.
# NOTE(review): the validation below checks `domain`, which is not a
# parameter of this function (NameError) - presumably `instance` is meant.
# NOTE(review): `blocklist` is required to be a dict, yet it is sliced and its
# items are indexed like dicts, which only works for a list of dicts. The
# slice [0 : 19] also keeps 19 entries while the message says 20.
702 def send_bot_post(instance: str, blocklist: dict):
703 # DEBUG: print(f"DEBUG: instance={instance},blocklist()={len(blocklist)} - CALLED!")
704 if type(domain) != str:
705 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
707 raise ValueError("Parameter 'domain' is empty")
708 elif type(blocklist) != dict:
709 raise ValueError(f"Parameter blocklist[]='{type(blocklist)}' is not 'dict'")
711 message = instance + " has blocked the following instances:\n\n"
714 if len(blocklist) > 20:
716 blocklist = blocklist[0 : 19]
718 # DEBUG: print(f"DEBUG: blocklist()={len(blocklist)}")
719 for block in blocklist:
720 # DEBUG: print(f"DEBUG: block['{type(block)}']={block}")
721 if block["reason"] == None or block["reason"] == '':
722 message = message + block["blocked"] + " with unspecified reason\n"
# NOTE(review): this shortens the reason in place, i.e. the caller's entry is
# modified as well.
724 if len(block["reason"]) > 420:
725 block["reason"] = block["reason"][0:419] + "[…]"
727 message = message + block["blocked"] + ' for "' + block["reason"].replace("@", "@\u200b") + '"\n'
730 message = message + "(the list has been truncated to the first 20 entries)"
732 botheaders = {**api_headers, **{"Authorization": "Bearer " + config.get("bot_token")}}
735 f"{config.get('bot_instance')}/api/v1/statuses",
738 "visibility" : config.get('bot_visibility'),
739 "content_type": "text/plain"
# Scrapes the block list that a Friendica instance publishes on its
# /friendica page: the element with id "about_blocklist" is located and the
# rows of its table are read (first cell: domain, second cell: reason).
# A failed request is recorded with instances.update_last_error().
747 def fetch_friendica_blocks(domain: str) -> dict:
748 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
749 if type(domain) != str:
750 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
752 raise ValueError(f"Parameter 'domain' is empty")
754 # DEBUG: print("DEBUG: Fetching friendica blocks from domain:", domain)
758 doc = bs4.BeautifulSoup(
759 get_response(domain, "/friendica", headers, (config.get("connection_timeout"), config.get("read_timeout"))).text,
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
762 except BaseException as e:
763 print("WARNING: Failed to fetch /friendica from domain:", domain, e)
764 instances.update_last_error(domain, e)
767 blocklist = doc.find(id="about_blocklist")
769 # Prevents exceptions:
770 if blocklist is None:
771 # DEBUG: print("DEBUG: Instance has no block list:", domain)
774 table = blocklist.find("table")
776 # DEBUG: print(f"DEBUG: table[]='{type(table)}'")
# NOTE(review): `table` is None when the block list contains no <table>; the
# next line then raises AttributeError.
777 if table.find("tbody"):
778 rows = table.find("tbody").find_all("tr")
780 rows = table.find_all("tr")
782 # DEBUG: print(f"DEBUG: Found rows()={len(rows)}")
784 # DEBUG: print(f"DEBUG: line='{line}'")
# NOTE(review): rows without <td> cells (e.g. a header row made of <th>) raise
# IndexError here - verify how header rows are skipped.
786 "domain": tidyup_domain(line.find_all("td")[0].text),
787 "reason": tidyup_reason(line.find_all("td")[1].text)
789 # DEBUG: print("DEBUG: Next!")
791 # DEBUG: print("DEBUG: Returning blocklist() for domain:", domain, len(blocklist))
# Collects the "suspended" and "blocked" instances of a Misskey server by
# paging through /api/federation/instances with post_json_api().
# The page size is the configured "misskey_limit". Suspended hosts are
# reported under "followers_only" and blocked hosts under "reject".
# Errors are recorded with instances.update_last_error(), and the last fetch
# time of the domain is updated at the end.
796 def fetch_misskey_blocks(domain: str) -> dict:
797 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
798 if type(domain) != str:
799 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
801 raise ValueError(f"Parameter 'domain' is empty")
803 # DEBUG: Fetching misskey blocks from domain is about to start
810 step = config.get("misskey_limit")
812 # iterating through all "suspended" (follow-only in its terminology)
813 # instances page-by-page, since the API does not support
814 # sending them all at once
816 # DEBUG: print(f"DEBUG: Fetching offset='{offset}' from '{domain}' ...")
818 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
819 fetched = post_json_api(domain, "/api/federation/instances", json.dumps({
828 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
829 fetched = post_json_api(domain, "/api/federation/instances", json.dumps({
834 "offset" : offset - 1
839 # DEBUG: print("DEBUG: fetched():", len(fetched))
840 if len(fetched) == 0:
841 # DEBUG: print("DEBUG: Returned zero bytes, exiting loop:", domain)
# NOTE(review): on a short page the offset grows by the shortfall
# (limit - fetched) and not by the number of rows received - confirm that
# this is intended.
843 elif len(fetched) != config.get("misskey_limit"):
844 # DEBUG: print(f"DEBUG: Fetched '{len(fetched)}' row(s) but expected: '{config.get('misskey_limit')}'")
845 offset = offset + (config.get("misskey_limit") - len(fetched))
847 # DEBUG: print("DEBUG: Raising offset by step:", step)
848 offset = offset + step
851 for instance in fetched:
# NOTE(review): has_key() receives the whole `instance` entry while the stored
# value is the tidied host - verify against has_key().
853 if instance["isSuspended"] and not has_key(blocklist["suspended"], "domain", instance):
855 blocklist["suspended"].append(
857 "domain": tidyup_domain(instance["host"]),
858 # no reason field, nothing
863 # DEBUG: print(f"DEBUG: count={count}")
865 # DEBUG: print(f"DEBUG: API is no more returning new instances, aborting loop!")
# NOTE(review): BaseException also covers KeyboardInterrupt and SystemExit.
868 except BaseException as e:
869 print("WARNING: Caught error, exiting loop:", domain, e)
870 instances.update_last_error(domain, e)
875 # same procedure again, this time for "blocked" (aka full suspend) instances
878 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
879 fetched = post_json_api(domain,"/api/federation/instances", json.dumps({
888 # DEBUG: print("DEBUG: Sending JSON API request to domain,step,offset:", domain, step, offset)
889 fetched = post_json_api(domain,"/api/federation/instances", json.dumps({
894 "offset" : offset - 1
899 # DEBUG: print("DEBUG: fetched():", len(fetched))
900 if len(fetched) == 0:
901 # DEBUG: print("DEBUG: Returned zero bytes, exiting loop:", domain)
903 elif len(fetched) != config.get("misskey_limit"):
904 # DEBUG: print(f"DEBUG: Fetched '{len(fetched)}' row(s) but expected: '{config.get('misskey_limit')}'")
905 offset = offset + (config.get("misskey_limit") - len(fetched))
907 # DEBUG: print("DEBUG: Raising offset by step:", step)
908 offset = offset + step
911 for instance in fetched:
913 if instance["isBlocked"] and not has_key(blocklist["blocked"], "domain", instance):
915 blocklist["blocked"].append({
916 "domain": tidyup_domain(instance["host"]),
920 # DEBUG: print(f"DEBUG: count={count}")
922 # DEBUG: print(f"DEBUG: API is no more returning new instances, aborting loop!")
925 except BaseException as e:
926 print("ERROR: Exception during POST:", domain, e)
927 instances.update_last_error(domain, e)
931 # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
932 instances.update_last_instance_fetch(domain)
934 # DEBUG: print("DEBUG: Returning for domain,blocked(),suspended():", domain, len(blocklist["blocked"]), len(blocklist["suspended"]))
936 "reject" : blocklist["blocked"],
937 "followers_only": blocklist["suspended"]
def tidyup_reason(reason: str) -> str:
    """Return a cleaned-up copy of a block reason.

    Leading and trailing whitespace is removed and every "â" character
    is replaced by a plain double quote.

    Raises:
        ValueError: if ``reason`` is not exactly of type ``str``.
    """
    # DEBUG: print(f"DEBUG: reason='{reason}' - CALLED!")
    if type(reason) is not str:
        raise ValueError(f"Parameter reason[]={type(reason)} is not expected")

    # Trim first, then swap the stray character for a double quote
    cleaned = reason.strip().replace("â", "\"")

    # DEBUG: print(f"DEBUG: cleaned='{cleaned}' - EXIT!")
    return cleaned
def tidyup_domain(domain: str) -> str:
    """Reduce a raw block-list entry to a bare, lower-case host name.

    The value is lower-cased and trimmed, then a trailing dot, a trailing
    slash, a trailing port number, a leading ``http:``/``https:`` scheme,
    a leading ``@``, any ``user@`` prefix and any ``/profile/...`` or
    ``/users/...`` suffix are removed.

    Args:
        domain: Raw domain-like string taken from a block list.

    Returns:
        The cleaned-up domain.

    Raises:
        ValueError: if ``domain`` is not a ``str``.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not expected")

    # All lower-case and strip spaces out + last dot
    domain = domain.lower().strip().rstrip(".")

    # No trailing slash - must happen before the port is removed, otherwise
    # "example.com:443/" keeps its port because the pattern is end-anchored
    domain = re.sub(r"/$", "", domain)

    # No port number
    domain = re.sub(r":\d+$", "", domain)

    # No protocol, sometimes without the slashes
    domain = re.sub(r"^https?:/*", "", domain)

    # No @ sign
    domain = re.sub(r"^@", "", domain)

    # No individual users in block lists
    domain = re.sub(r".+@", "", domain)

    # str.find() returns -1 (truthy) when nothing is found, so it must not be
    # used as a condition. split() leaves the string untouched when the marker
    # is absent, which makes an explicit check unnecessary.
    for marker in ("/profile/", "/users/"):
        domain = domain.split(marker)[0]

    # DEBUG: print(f"DEBUG: domain='{domain}' - EXIT!")
    return domain
def json_from_response(response: requests.models.Response) -> list:
    """Decode the JSON body of an HTTP response.

    Returns whatever ``response.json()`` yields, or an empty list when the
    body is blank or cannot be decoded as JSON.

    Raises:
        ValueError: if ``response`` is not a ``requests`` Response object.
    """
    # DEBUG: print(f"DEBUG: response[]={type(response)} - CALLED!")
    if not isinstance(response, requests.models.Response):
        raise ValueError(f"Parameter response[]='{type(response)}' is not type of 'Response'")

    # A body made only of whitespace carries nothing to decode
    if response.text.strip() == "":
        return list()

    # DEBUG: print(f"DEBUG: response.text()={len(response.text)} is not empty, invoking response.json() ...")
    try:
        return response.json()
    except json.decoder.JSONDecodeError:
        # An invalid body is treated the same way as an empty one
        return list()
def get_response(domain: str, path: str, headers: dict, timeout: list) -> requests.models.Response:
    """Fetch ``https://{domain}{path}`` with ``reqto.get()``.

    ``headers`` and ``timeout`` are handed through to ``reqto.get()``
    unchanged.

    Raises:
        ValueError: if ``domain`` or ``path`` is not a ``str`` or is empty.
        requests.exceptions.ConnectionError: re-raised after the error has
            been passed to ``instances.update_last_error()``.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}',headers()={len(headers)},timeout={timeout} - CALLED!")
    if type(domain) is not str:
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if type(path) is not str:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
    if path == "":
        raise ValueError("Parameter 'path' is empty")

    target = f"https://{domain}{path}"

    try:
        # DEBUG: print(f"DEBUG: Sending request to '{domain}{path}' ...")
        result = reqto.get(target, headers=headers, timeout=timeout)
    except requests.exceptions.ConnectionError as error:
        # DEBUG: print(f"DEBUG: Fetching '{path}' from '{domain}' failed. exception[{type(error)}]='{str(error)}'")
        # Remember the failure for this domain, then let the caller deal with it
        instances.update_last_error(domain, error)
        raise error

    # DEBUG: print(f"DEBUG: result[]='{type(result)}' - EXIT!")
    return result
def has_key(keys: list, search: str, value: any) -> bool:
    """Tell whether any dict in ``keys`` maps ``search`` to ``value``.

    The entries are checked in order and the scan stops at the first match.

    Raises:
        ValueError: if ``keys`` is not a ``list``, ``search`` is not a
            ``str`` or is empty, or an inspected entry is not a ``dict``.
        KeyError: if an inspected entry lacks the ``search`` key.
    """
    # DEBUG: print(f"DEBUG: keys()={len(keys)},search='{search}',value[]='{type(value)}' - CALLED!")
    if type(keys) is not list:
        raise ValueError(f"Parameter keys[]='{type(keys)}' is not 'list'")
    if type(search) is not str:
        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
    if search == "":
        raise ValueError("Parameter 'search' is empty")

    # DEBUG: print(f"DEBUG: Checking keys()={len(keys)} ...")
    for entry in keys:
        # DEBUG: print(f"DEBUG: entry['{type(entry)}']={entry}")
        if type(entry) is not dict:
            raise ValueError(f"key[]='{type(entry)}' is not 'dict'")
        if search not in entry:
            raise KeyError(f"Cannot find search='{search}'")
        if entry[search] == value:
            # First hit settles it, later entries are not inspected
            return True

    return False
def find_domains(tag: bs4.element.Tag) -> list:
    """Collect blocked domains and their reasons from an HTML table.

    Each ``<tr>`` below ``tag`` that contains ``<td>`` cells is read as one
    block entry: the first cell is cleaned with ``tidyup_domain()`` and the
    second cell with ``tidyup_reason()``.  Blacklisted domains and values
    rejected by ``validators.domain()`` are skipped with a warning.

    Args:
        tag: Element whose descendant ``<tr>`` rows are scanned.

    Returns:
        A list of ``{"domain": ..., "reason": ...}`` dicts.

    Raises:
        ValueError: if ``tag`` is not a ``bs4.element.Tag``.
        KeyError: if ``tag`` contains no ``<tr>`` element at all.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Query the rows once and reuse them for both the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        # DEBUG: print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        # NOTE(review): a row with only one <td> still raises IndexError
        # here, exactly as before - confirm whether such rows can occur.
        domain = tidyup_domain(cells[0].text)
        reason = tidyup_reason(cells[1].text)

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print(f"DEBUG: Multiple domains detected in one row")
            # This one row names several hosts, record each of them
            for single in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": single,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def get_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
    """Fetch a full URL by delegating to ``get_response()``.

    The URL is split with ``urlparse()``; its host name and its path (plus
    the query string, when there is one) are passed on together with
    ``headers`` and ``timeout``.

    Raises:
        ValueError: if ``url`` is not a ``str`` or is empty.
    """
    # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
    if type(url) is not str:
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    if url == "":
        raise ValueError("Parameter 'url' is empty")

    # DEBUG: print(f"DEBUG: Parsing url='{url}'")
    parts = urlparse(url)

    # Only append "?query" when a query exists, this avoids a trailing ?
    # DEBUG: print(f"DEBUG: parts[{type(parts)}]={parts}")
    target = f"{parts.path}?{parts.query}" if parts.query != "" else f"{parts.path}"

    return get_response(parts.hostname, target, headers, timeout)