1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
20 from fba import blacklist
21 from fba import config
23 from fba import network
25 from fba.helpers import tidyup
26 from fba.helpers import version
28 from fba.models import instances
30 from fba.networks import lemmy
31 from fba.networks import misskey
32 from fba.networks import peertube
34 # "rel" identifiers (no real URLs)
# Values accepted in the "rel" member of a /.well-known/nodeinfo link entry.
# Both the https:// and the http:// spelling of each schema version are listed.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peers of `domain` and register every acceptable new one.

    Args:
        domain: Domain whose peers are fetched; must be a non-empty string
            that does not end in ".tld".
        origin: Domain that referred to `domain`, or None.
        software: Software name of `domain`; when None it is looked up with
            determine_software().
        command: Name of the invoking command, handed to instances.add().
        path: Optional nodeinfo path, handed to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type, is empty, or `domain`
            ends in ".tld".
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif software is None:
        # NOTE(review): this branch is part of the validation chain, so a call
        # with software=None skips the `command` checks, the .arpa guard and
        # the registration of `domain` below - confirm that this is intended.
        instances.set_data("last_instance_fetch", domain, time.time())
        software = determine_software(domain, path)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif domain.endswith(".arpa"):
        print(f"WARNING: domain='{domain}' is a reversed .arpa domain and should not be used generally.")
        return
    elif not instances.is_registered(domain):
        instances.add(domain, origin, command, path)

    instances.set_data("last_instance_fetch", domain, time.time())

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending(domain):
        # Flush data that was queued for this domain while fetching
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        # Skip "None" types as tidyup.domain() cannot parse them
        if instance is None:
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif instance.endswith(".arpa"):
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        # A second, identical .arpa test used to follow here; it could never
        # match because such instances are already skipped above.
        if not instances.is_registered(instance):
            instances.add(instance, domain, command)
def fetch_peers(domain: str, software: str) -> list:
    """Return the peers reported by `domain`.

    For "misskey", "lemmy" and "peertube" the call is delegated to the
    matching module in fba.networks. Every other software is queried through
    "/api/v1/instance/peers" and, when that fails, through a second endpoint
    whose "federated_instances" member is converted with add_peers().

    Args:
        domain: Non-empty domain name to query.
        software: Software name of `domain`, or None.

    Returns:
        The peers found; an empty list when the CSRF check raised or when
        neither endpoint delivered usable data.

    Raises:
        ValueError: If `domain` is not a non-empty string or `software` is
            neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        return peers

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    if "error_message" in data:
        # NOTE(review): the path of this fallback request was elided in the
        # reviewed extract; "/api/v3/site" is reconstructed from the
        # "federated_instances" member read below - confirm.
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            peers = peers + add_peers(data["json"]["federated_instances"])
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)
    else:
        # The first endpoint answered, its JSON body is taken as the peer list
        peers = data["json"]

    instances.set_data("total_peers", domain, len(peers))

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for `domain`, via auto-discovery first, then fixed paths.

    Args:
        domain: Non-empty domain name to query.
        path: Optional request path; when set and non-empty, only the fixed
            path equal to it is requested.

    Returns:
        The JSON found through fetch_wellknown_nodeinfo() when that succeeded
        with a non-empty body. Otherwise the result of the last fixed-path
        request (an empty dict when no request was made), or a dict with
        "error_message" and "exception" when the CSRF check raised.

    Raises:
        ValueError: If `domain` is not a non-empty string or `path` is
            neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    data = dict()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        # NOTE(review): the "status_code" entry was elided in the reviewed
        # extract; 500 is reconstructed - confirm.
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # NOTE(review): only the two ".json" entries were visible in the reviewed
    # extract; the remaining entries are reconstructed - confirm.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    for request in request_paths:
        if path is not None and path != "" and path != request:
            # A specific path was asked for and this is not it
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
        )

        if "error_message" not in data:
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url", domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # NOTE(review): on success this returns the whole get_json_api() result,
    # while the auto-discovery branch above returns only its "json" member -
    # confirm that callers cope with both shapes.
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo of `domain` through "/.well-known/nodeinfo".

    Each entry of the "links" member whose "rel" is listed in
    nodeinfo_identifier is fetched until one response carries a "json" member.

    Args:
        domain: Non-empty domain name to query.

    Returns:
        The result of the last request made, or a dict with "error_message"
        and "exception" when the CSRF check raised.

    Raises:
        ValueError: If `domain` is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
        # "error_message" is text here as well (it used to hold the exception
        # class itself), matching what fetch_nodeinfo() reports.
        # NOTE(review): the "status_code" entry was elided in the reviewed
        # extract; 500 is reconstructed - confirm.
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        if "links" in nodeinfo:
            for link in nodeinfo["links"]:
                if link["rel"] in nodeinfo_identifier:
                    data = network.fetch_api_url(
                        link["href"],
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    if "json" in data:
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url", domain, link["href"])
                        break
                    else:
                        instances.update_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of `domain` from the HTML served at `path`.

    The page's <meta name="generator"> tag is preferred, with
    <meta property="og:site_name"> as fallback. The value found is passed
    through tidyup.domain() and the helpers of the version module.

    Args:
        domain: Non-empty domain name to query.
        path: Non-empty request path, "/" by default.

    Returns:
        The detected software name, or None when nothing was found or the
        tidied value was empty.

    Raises:
        ValueError: If `domain` or `path` is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    if response.ok and response.status_code < 300 and len(response.text) > 0:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        if isinstance(generator, bs4.element.Tag):
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag):
            # This assignment used to go to a misspelled name ("sofware"), so
            # the og:site_name value was lost and None was reported.
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # Report "nothing found" as None, not as an empty string
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # A dot or space hints at an attached version number
        software = version.remove(software)

    if isinstance(software, str) and "powered by " in software:
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software `domain` runs.

    The name comes from the nodeinfo member [software][name]; when nodeinfo is
    unavailable, reports an error or lacks that member, the answer of
    fetch_generator_from_path() is returned instead. Several fork names are
    mapped onto the software they derive from.

    Args:
        domain: Non-empty domain name to query.
        path: Optional nodeinfo path, handed to fetch_nodeinfo().

    Returns:
        The software name, or None when none could be determined.

    Raises:
        ValueError: If `domain` is not a non-empty string or `path` is
            neither a string nor None.
        Exception: Whatever fetch_nodeinfo() stored under "exception" is
            raised again.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    software = None

    data = fetch_nodeinfo(domain, path)

    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or not isinstance(data["software"], dict) or "name" not in data["software"]:
        # A "software" member that is not an object (e.g. null or a plain
        # string) is treated like a missing one instead of raising TypeError.
        software = fetch_generator_from_path(domain)
        return software

    software = tidyup.domain(data["software"]["name"])

    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software == "runtube.re":
        software = "peertube"
    elif software == "nextcloud social":
        software = "nextcloud"
    elif software.find("/") > 0:
        # "> 0": a slash at the very first position does not trigger a split
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    # NOTE(review): the condition and the exit of this branch were elided in
    # the reviewed extract; both are reconstructed from the warning - confirm.
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        return None

    if str(software) == "":
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        # A dot or space hints at an attached version number
        software = version.remove(software)

    if isinstance(software, str) and "powered by" in software:
        software = version.remove(version.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Collect domain/reason pairs from the rows of an HTML table.

    The first <td> of a row is read as the domain and the second one as the
    reason. Rows without any <td> are skipped, as are blacklisted and invalid
    domains.

    Args:
        tag: Element that contains the <tr> rows.

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError: If `tag` contains no <tr> element.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once and reuse them for the emptiness check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # E.g. a header row
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # One row lists several domains.
            # NOTE(review): only the "develop.gab.com" entry was visible in
            # the reviewed extract; the other two are reconstructed from the
            # compared string - confirm.
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def add_peers(rows: dict) -> list:
    """Flatten the "linked", "allowed" and "blocked" lists of `rows`.

    Every peer is passed through tidyup.domain(); blacklisted peers are left
    out. Keys that are missing or hold None are ignored.

    Args:
        rows: Mapping that may contain the three keys named above.

    Returns:
        The tidied peers in the order linked, allowed, blocked.
    """
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        entries = rows.get(key)
        if entries is None:
            continue

        for peer in entries:
            peer = tidyup.domain(peer)

            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers