1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from fba import blacklist
20 from fba import config
23 from fba import instances
24 from fba import network
26 from fba.helpers import tidyup
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
# "rel" identifiers (no real URLs)
# fetch_wellknown_nodeinfo() compares the "rel" value of every link found in
# /.well-known/nodeinfo against this list; both the https:// and the http://
# spelling of each schema version are accepted.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register ``domain`` if needed, fetch its peers and register each new, valid peer.

    NOTE(review): the reviewed copy of this function had lost its indentation
    and its shortest lines (empty-string guards, bare ``return``/``continue``).
    Those were reconstructed from context; please diff against the canonical file.

    Args:
        domain: Domain whose peers are fetched; must be a non-empty string.
        origin: Forwarded to instances.add() together with ``domain``; may be None.
        software: Software name used to pick the peer-fetching strategy. When
            None it is looked up with determine_software(domain, path).
        script: Forwarded to instances.add() for every added domain; must be
            a non-empty string.
        path: Optional path forwarded to determine_software() and instances.add().

    Raises:
        ValueError: If a parameter has the wrong type or is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    # All parameters are validated up front, so 'script' is also checked when
    # 'software' is None and no network request is made with bad arguments.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]='{type(script)}' is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if domain.split(".")[-1] == "arpa":
        print(f"WARNING: domain='{domain}' is a reversed .arpa domain and should not be used generally.")
        return
    elif not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        # DEBUG: print(f"DEBUG: instance='{instance}'")
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif instance.split(".")[-1] == "arpa":
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        # .arpa instances were already skipped above, so only the
        # registration check is left here.
        if not instances.is_registered(instance):
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Return the peers of ``domain``.

    For "misskey", "lemmy" and "peertube" the work is delegated to the
    matching module in fba.networks. Every other software (including None)
    is queried through "/api/v1/instance/peers" and, if that fails, through a
    second endpoint whose "federated_instances" entry is flattened with
    add_peers().

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. The variable initialisations, the positional ``domain``/``headers``
    arguments, the ``else:`` branches, the ``return`` statements and the
    fallback path "/api/v3/site" were reconstructed -- verify them against
    the canonical file.

    Args:
        domain: Non-empty domain name to query.
        software: Software name of ``domain`` or None.

    Returns:
        The peers that were found; an empty list when the CSRF check or both
        API requests failed.

    Raises:
        ValueError: If ``domain`` is not a non-empty string or ``software``
            is neither a string nor None.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        return peers

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
    if "error_message" in data:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["json"]["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)
    else:
        # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
        peers = data["json"]

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch the nodeinfo document of ``domain``.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. When
    that yields no usable JSON, a fixed list of request paths is probed
    (restricted to ``path`` when one is given).

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. The variable initialisations, the "status_code" entry of the error
    dict, the tail of ``request_paths`` ("/nodeinfo/2.1", "/nodeinfo/2.0",
    "/nodeinfo/1.0", "/api/v1/instance"), the positional request arguments
    and the ``continue``/``return`` statements were reconstructed -- verify
    them against the canonical file.

    Args:
        domain: Non-empty domain name to query.
        path: Optional single request path; when set (and not empty) every
            other request path is skipped.

    Returns:
        On success the JSON payload of the nodeinfo document. On failure a
        dict carrying "error_message" (plus "exception" when the CSRF check
        raised), or an empty dict when no request path was tried.

    Raises:
        ValueError: If ``domain`` is not a non-empty string or ``path`` is
            neither a string nor None.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]()='{len(nodeinfo)}'")
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        # DEBUG: print(f"DEBUG: Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!")
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    data = dict()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    for request in request_paths:
        # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'")
        if path is not None and path != "" and path != request:
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        data = network.get_json_api(
            domain,
            request,
            headers,
            (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" not in data:
            # DEBUG: print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url"  , domain, request)

            # Hand back the JSON payload, exactly like the auto-discovery
            # branch above does: determine_software() looks for "software"
            # at the top level of the returned dict.
            # NOTE(review): the lost line here may have been a plain 'break'
            # followed by 'return data' (the wrapper dict) -- confirm.
            return data["json"]

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through "/.well-known/nodeinfo".

    Every entry of the "links" list is inspected; the first one whose "rel"
    is listed in ``nodeinfo_identifier`` and whose "href" returns JSON wins.

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. ``headers = tuple()``, the "status_code" entry of the error dict,
    the positional request arguments, the ``if "json" in data:`` test, the
    ``break``, the three ``else:`` lines and the final ``return data`` were
    reconstructed -- verify them against the canonical file.

    Args:
        domain: Non-empty domain name to query.

    Returns:
        The dict returned by the last network call, or a dict with
        "status_code", "error_message" and "exception" when the CSRF check
        raised.

    Raises:
        ValueError: If ``domain`` is not a non-empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown,{__name__}) - EXIT!")
        return {
            "status_code"  : 500,
            # Same string format as fetch_nodeinfo() uses, instead of the bare
            # exception class, so "error_message" is always printable text.
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        if "links" in nodeinfo:
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
                if link["rel"] in nodeinfo_identifier:
                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
                    data = network.fetch_api_url(
                        link["href"],
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
                    if "json" in data:
                        # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url"  , domain, link["href"])
                        break
                    else:
                        instances.update_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML served at ``path``.

    The page is searched for ``<meta name="generator">`` first and for
    ``<meta property="og:site_name">`` second. The content found is tidied,
    stripped of version numbers and of "powered by"/"hosted on"/" by "/" see "
    decorations.

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. The empty-string guards, both ``software = None`` assignments and
    the final ``return software`` were reconstructed -- verify them against
    the canonical file.

    Args:
        domain: Non-empty domain name to query.
        path: Non-empty path of the page to inspect.

    Returns:
        The detected software name, or None when nothing usable was found.

    Raises:
        ValueError: If ``domain`` or ``path`` is not a non-empty string.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A meta tag without a "content" attribute yields None, which
        # tidyup.domain() cannot parse, so such tags are treated as absent.
        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            # The result was previously stored in a misspelled variable
            # ('sofware') and therefore never used.
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software ``domain`` runs.

    The name is taken from ``["software"]["name"]`` of the nodeinfo document
    returned by fetch_nodeinfo(). Whenever nodeinfo is missing, is an error or
    lacks that entry, fetch_generator_from_path() is used instead. Known
    forks are mapped onto the software they derive from, and decorations
    ("/", "|", "powered by", " by ", " see ", version numbers) are stripped.

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. The fork assignments ``"pleroma"``/``"misskey"`` (named in the
    adjacent DEBUG comments), the test for an empty name and the ``return``
    statements were reconstructed -- verify them against the canonical file.

    Args:
        domain: Non-empty domain name to query.
        path: Optional nodeinfo path forwarded to fetch_nodeinfo().

    Returns:
        The normalised software name, or whatever
        fetch_generator_from_path() returned (possibly None).

    Raises:
        ValueError: If ``domain`` is not a non-empty string or ``path`` is
            neither a string nor None.
        Exception: Any exception that fetch_nodeinfo() handed back under the
            "exception" key is re-raised.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'")
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        # DEBUG: print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'")
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["software"]["name"])

    # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if software is None or software == "":
        # An empty name goes straight to the generator fallback. Resetting it
        # to None first (as the lost lines most likely did) would make the
        # fallback unreachable and evaluate '"." in None' further down.
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first ``<td>`` of every row is taken as the domain and the second as
    the reason. Rows without any ``<td>``, blacklisted domains and invalid
    domains are skipped.

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. ``domains = list()``, the ``continue`` statements, the
    ``domains.append({...})`` wrappers with their ``"reason": reason`` entries,
    the "gab.com" and "gab.ai" entries of the multi-domain special case and
    the final ``return domains`` were reconstructed -- verify them against
    the canonical file.

    Args:
        tag: The table element to scan.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: If ``tag`` is not a bs4.element.Tag.
        KeyError: If ``tag`` contains no ``<tr>`` elements.
    """
    # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once; the result serves both the guard and the loop
    table_rows = tag.select("tr")
    if len(table_rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in table_rows:
        # DEBUG: print(f"DEBUG: element[]='{type(element)}'")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print("DEBUG: Multiple domains detected in one row")
            # This one row names several domains; each gets its own entry
            # with the shared reason.
            for gab_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def add_peers(rows: dict) -> list:
    """Flatten the "linked", "allowed" and "blocked" lists of ``rows`` into one list.

    Every peer is normalised with tidyup.domain(); blacklisted peers are
    dropped. Keys that are absent or None are ignored.

    NOTE(review): the reviewed copy had lost its indentation and its shortest
    lines. ``peers = list()``, the ``continue``, ``peers.append(peer)`` and
    the final ``return peers`` were reconstructed -- verify them against the
    canonical file.

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked", each holding an iterable of peers or None.

    Returns:
        The tidied, non-blacklisted peers in the order linked, allowed,
        blocked.
    """
    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if key not in rows or rows[key] is None:
            continue

        # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
        for peer in rows[key]:
            if peer is None:
                # tidyup.domain() cannot parse None, so skip such entries
                continue

            # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
            peer = tidyup.domain(peer)

            # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
            if blacklist.is_blacklisted(peer):
                # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                continue

            # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
            peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers