1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from fba import blacklist
20 from fba import config
23 from fba import instances
24 from fba import network
26 from fba.helpers import tidyup
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
32 # "rel" identifiers (no real URLs)
# Accepted "rel" values of links found in /.well-known/nodeinfo documents.
# Both the https:// and the http:// spelling of each schema version are listed
# because the values are compared as plain strings.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register ``domain`` (if unknown) and every usable peer it reports.

    Args:
        domain: Instance whose peers should be fetched, must be non-empty.
        origin: Domain that led to ``domain``, or ``None``.
        software: Software name of ``domain``; when ``None`` it is determined
            through ``determine_software(domain, path)``.
        script: Name of the invoking script/command, must be non-empty.
        path: Optional nodeinfo path forwarded to ``determine_software()`` and
            ``instances.add()``.

    Raises:
        ValueError: If a parameter has the wrong type or is empty.
    """
    # All parameters are validated before any detection or registration is
    # done, so an auto-detected software does not bypass the 'script' checks.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]='{type(script)}' is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    if software is None:
        # Caller does not know the software, so detect it
        software = determine_software(domain, path)

    if not instances.is_registered(domain):
        instances.add(domain, origin, script, path)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Data collected while fetching (e.g. total_peers) is flushed now
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the host part before the first slash is validated
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        if not instances.is_registered(instance):
            # The instance is new; this domain becomes its origin
            instances.add(instance, domain, script)
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of ``domain``.

    misskey, lemmy and peertube are delegated to their own modules. Every
    other software is queried through ``/api/v1/instance/peers`` with a
    fallback to ``/api/v3/site`` (``federated_instances``).

    Args:
        domain: Instance to query, must be non-empty.
        software: Software name of the instance, or ``None`` when unknown.

    Returns:
        The peers found; an empty list when nothing could be fetched.

    Raises:
        ValueError: If a parameter has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        return peers

    timeouts = (config.get("connection_timeout"), config.get("read_timeout"))
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeouts
    )

    if "error_message" in data:
        # Primary API failed, try the alternative one
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeouts
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            peers = peers + add_peers(data["json"]["federated_instances"])
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)
    elif isinstance(data["json"], list):
        # Querying the API was successful
        peers = data["json"]
    else:
        # The payload is remote input; anything but a list cannot be counted
        # or iterated as peers, so it is recorded as an error instead.
        message = "JSON response from '/api/v1/instance/peers' is not a list"
        print(f"WARNING: {message},domain='{domain}'")
        instances.update_last_error(domain, message)

    instances.set_data("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo of ``domain``, first via auto-discovery, then via static paths.

    Args:
        domain: Instance to query, must be non-empty.
        path: Optional request path; when set (and non-empty) only the static
            path equal to it is requested.

    Returns:
        The reply dict of the last request. It contains ``error_message``
        when fetching failed, including the case that no static path matched
        ``path``.

    Raises:
        ValueError: If a parameter has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # NOTE(review): a failed auto-discovery ends the lookup while a successful
    # one still continues with the static paths below - confirm this is intended.
    if "error_message" in nodeinfo:
        print(f"WARNING: Error during fetching nodeinfo: '{nodeinfo['error_message']}' - EXIT!")
        return nodeinfo

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        # Same error shape as fetch_wellknown_nodeinfo() so callers can test
        # for "error_message"
        return {
            "status_code"  : 500,
            "error_message": type(exception)
        }

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    # Pre-set an error reply: without it 'data' stays unbound when 'path'
    # matches none of the static request paths (e.g. a full URL).
    data = {
        "status_code"  : 500,
        "error_message": f"No nodeinfo request path matches path='{path}'"
    }

    timeouts = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    for request in request_paths:
        if path is not None and path != "" and path != request:
            # A specific path was asked for and this is not it
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            timeouts
        )

        if "error_message" not in data:
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url"  , domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through ``/.well-known/nodeinfo``.

    Each entry in the document's ``links`` whose ``rel`` is listed in
    ``nodeinfo_identifier`` is fetched until one reply contains ``json``.

    Args:
        domain: Instance to query, must be non-empty.

    Returns:
        The reply dict of the last request made (the well-known document
        itself when no link was followed), or an error dict with
        ``status_code`` and ``error_message`` when the CSRF check failed.

    Raises:
        ValueError: If ``domain`` is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown,{__name__}) - EXIT!")
        return {
            "status_code"  : 500,
            "error_message": type(exception)
        }

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]

        # The document is remote input, so its shape is checked before use
        if isinstance(nodeinfo, dict) and "links" in nodeinfo:
            for link in nodeinfo["links"]:
                if not isinstance(link, dict) or "rel" not in link or "href" not in link:
                    print(f"WARNING: Malformed nodeinfo link from domain='{domain}' - skipped!")
                    continue
                elif link["rel"] not in nodeinfo_identifier:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
                    continue

                data = network.fetch_api_url(
                    link["href"],
                    (config.get("connection_timeout"), config.get("read_timeout"))
                )

                if "json" in data:
                    # First usable nodeinfo document wins
                    instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                    instances.set_data("nodeinfo_url"  , domain, link["href"])
                    break

                instances.update_last_error(domain, data)
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML found at ``path``.

    ``<meta name="generator">`` is preferred, ``<meta property="og:site_name">``
    is the fallback. Version numbers and phrases such as "powered by",
    "hosted on", " by " and " see " are stripped from the value found.

    Args:
        domain: Instance to query, must be non-empty.
        path: Path of the page to fetch, must be non-empty.

    Returns:
        The cleaned-up software name, or ``None`` when nothing was found.

    Raises:
        ValueError: If a parameter has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    if response.ok and response.status_code < 300 and len(response.text) > 0:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        if isinstance(generator, bs4.element.Tag):
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag):
            # The result must land in 'software'; a misspelled variable name
            # used to drop the og:site_name value here.
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # Nothing usable was left, report "not detected"
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = fba.remove_version(software)

    if isinstance(software, str) and " powered by " in software:
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = fba.strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name of ``domain``.

    Nodeinfo (``software.name``) is the primary source. Whenever nodeinfo is
    unavailable, reports an error or lacks the name,
    ``fetch_generator_from_path()`` is used instead.

    Args:
        domain: Instance to query, must be non-empty.
        path: Optional nodeinfo path forwarded to ``fetch_nodeinfo()``.

    Returns:
        The normalized software name, or ``None`` when none could be found.

    Raises:
        ValueError: If a parameter has the wrong type or ``domain`` is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if "error_message" in data:
        # Could not fetch nodeinfo at all
        return fetch_generator_from_path(domain)

    if "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
        print("WARNING: JSON response is an error:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data["json"]:
        # The message lives inside the JSON payload, not in the reply dict
        print("WARNING: JSON response contains only a message:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data["json"] or "name" not in data["json"]["software"]:
        # JSON response does not include [software][name]
        return fetch_generator_from_path(domain)

    software = tidyup.domain(data["json"]["software"]["name"])

    # Forks are stored under the name of the software they derive from
    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = fba.strip_until(software, " see ")

    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        return None

    # NOTE(review): a later generator fallback for an empty name could never
    # run after the check above and has been left out.
    if isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = fba.remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = fba.remove_version(fba.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first ``<td>`` of a row is taken as the domain, the second one as the
    reason. Rows without ``<td>``, blacklisted domains and invalid domains
    are skipped.

    Args:
        tag: Element containing the ``<tr>`` rows.

    Returns:
        A list of dicts with the keys ``domain`` and ``reason``.

    Raises:
        ValueError: If ``tag`` is not a ``bs4.element.Tag``.
        KeyError: If ``tag`` contains no ``<tr>`` at all.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows only once; they are needed for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # Most likely a header row, no <td> found
            continue

        domain = tidyup.domain(cells[0].text)

        # A row with a single cell has no reason column; it is kept with
        # reason=None rather than failing with an IndexError.
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains in one row, all sharing the same reason
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def add_peers(rows: dict) -> list:
    """Collect peers from a ``federated_instances`` style dict.

    The keys ``linked``, ``allowed`` and ``blocked`` are read; missing keys
    and ``None`` values are ignored.

    Args:
        rows: Dict mapping the keys above to lists of peers. A peer is either
            a domain string or a dict carrying the domain under ``domain``.

    Returns:
        The tidied-up peers that are not blacklisted.
    """
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        if key not in rows or rows[key] is None:
            continue

        for peer in rows[key]:
            if isinstance(peer, dict) and "domain" in peer:
                # Newer API versions return instance objects, not bare strings
                peer = peer["domain"]

            if peer is None:
                # Skip "None" types as tidyup.domain() cannot parse them
                continue

            peer = tidyup.domain(peer)

            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers