1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from fba import blacklist
20 from fba import config
23 from fba import instances
24 from fba import network
26 from fba.helpers import tidyup
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
32 # "rel" identifiers (no real URLs)
# Accepted nodeinfo schema identifiers, newest schema version first, in both
# their https:// and http:// spellings. They are only compared against the
# "rel" value of discovery links and are never requested.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register a domain and all valid, non-blacklisted peers it reports.

    Args:
        domain: Domain whose peers are fetched.
        origin: Domain that referred to `domain`, or None.
        software: Software name of `domain`; when None it is determined
            through determine_software().
        script: Name of the invoking script, handed on to instances.add().
        path: Optional nodeinfo path, handed on to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type, or if `domain` or
            `script` is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]='{type(script)}' is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    # Kept out of the validation chain above so that a missing software name
    # can never bypass the checks on 'script'.
    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the host part in front of the first slash is validated
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        if not instances.is_registered(instance):
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the list of peers a domain federates with.

    Misskey, Lemmy and PeerTube are delegated to their own modules. For every
    other software "/api/v1/instance/peers" is requested first and, on error,
    an alternative endpoint providing 'federated_instances' is tried.

    Args:
        domain: Domain to query.
        software: Software name of `domain`, or None when unknown.

    Returns:
        The peers that were found; an empty list when nothing could be
        fetched through the generic endpoints.

    Raises:
        ValueError: If `domain` is not a non-empty string or `software` is
            neither a string nor None.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        return peers

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeout
    )

    # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
    if "error_message" in data:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        # NOTE(review): the alternative path was not readable in the reviewed
        # source; "/api/v3/site" is assumed because the response is searched
        # for 'federated_instances' - confirm.
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeout
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["json"]["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)
    elif isinstance(data["json"], list):
        # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
        peers = data["json"]
    else:
        # A remote host may answer with any JSON document; only a list can be
        # counted and iterated as peers.
        message = f"JSON response is not a list of peers, json[]='{type(data['json'])}'"
        print(f"WARNING: {message},domain='{domain}'")
        instances.update_last_error(domain, message)

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo of a domain from well-known static request paths.

    Args:
        domain: Domain to query.
        path: When set to a non-empty string, only the request path equal to
            it is tried; otherwise all known request paths are tried in order.

    Returns:
        The response dict of the first request without 'error_message', else
        the last failed response. A dict containing 'error_message' is also
        returned when discovery or CSRF detection fails or when no request
        path was tried at all.

    Raises:
        ValueError: If `domain` is not a non-empty string or `path` is
            neither a string nor None.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]='{nodeinfo}'")
    # NOTE(review): a failed discovery ends the lookup while a successful one
    # is ignored in favour of the static paths below - confirm this is the
    # intended direction of the check.
    if "error_message" in nodeinfo:
        print(f"WARNING: Error during fetching nodeinfo: '{nodeinfo['error_message']}' - EXIT!")
        return nodeinfo

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    # Starts out as an error so that callers testing for 'error_message' keep
    # working when every request path below is skipped.
    data = {
        "status_code": 500,
        "error_message": f"No request path matches path='{path}'",
    }

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        return {
            "status_code": 500,
            "error_message": type(exception),
        }

    # NOTE(review): only the two ".json" entries were readable in the reviewed
    # source; the remaining entries are assumed - confirm.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    for request in request_paths:
        # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'")
        if path is not None and path != "" and path != request:
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        data = network.get_json_api(
            domain,
            request,
            headers,
            timeout
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" not in data:
            # DEBUG: print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url" , domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through "/.well-known/nodeinfo".

    Every entry of the document's 'links' whose 'rel' is listed in
    nodeinfo_identifier is fetched in document order until one response
    contains JSON.

    Args:
        domain: Domain to query.

    Returns:
        The response dict of the last request that was made; it contains
        'error_message' when that request (or CSRF detection) failed.

    Raises:
        ValueError: If `domain` is not a non-empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown,{__name__}) - EXIT!")
        # NOTE(review): the status code was not readable in the reviewed
        # source; 500 is assumed - confirm.
        return {
            "status_code": 500,
            "error_message": type(exception)
        }

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        if isinstance(nodeinfo, dict) and "links" in nodeinfo:
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                # The document comes from a remote host, so an entry may lack
                # the keys read below.
                if not isinstance(link, dict) or "rel" not in link or "href" not in link:
                    print("WARNING: Malformed entry in nodeinfo 'links':", domain)
                    continue

                # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
                if link["rel"] in nodeinfo_identifier:
                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
                    data = network.fetch_api_url(
                        link["href"],
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
                    # NOTE(review): the success condition was not readable in
                    # the reviewed source; presence of 'json' is assumed.
                    if "json" in data:
                        # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url" , domain, link["href"])
                        break
                    else:
                        instances.update_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess a domain's software from the meta tags of one of its pages.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fallback. Version numbers and phrases such as " powered by " are
    stripped from the value through the fba helpers.

    Args:
        domain: Domain to query.
        path: Path of the page to fetch, "/" by default.

    Returns:
        The software name, or None when none could be found.

    Raises:
        ValueError: If `domain` or `path` is not a non-empty string.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A tag without a 'content' attribute yields None, which must not be
        # handed to tidyup.domain(); such a tag is treated as absent.
        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software a domain runs.

    The name is taken from the nodeinfo field [software][name] and mapped or
    cleaned up; whenever nodeinfo is unavailable, reports an error or yields
    no usable name, fetch_generator_from_path() is used instead.

    Args:
        domain: Domain to query.
        path: Optional nodeinfo path, handed on to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If `domain` is not a non-empty string or `path` is
            neither a string nor None.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'")
    if "error_message" in data:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    # DEBUG: print("DEBUG: data():", len(data), data)
    if "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
        print("WARNING: JSON response is an error:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data["json"]:
        # The message is part of the JSON payload, not of the response dict
        print("WARNING: JSON response contains only a message:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data["json"] or "name" not in data["json"]["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["json"]["software"]["name"])

    # Forks are reported under the name of the software they derive from.
    # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if software is None:
        # Testing str(software) == "" can never match None, and a substring
        # test on None raises TypeError, so None is checked explicitly.
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first cell of a row is read as the domain and the second cell as the
    reason. Rows without any cell and blacklisted or invalid domains are
    skipped.

    Args:
        tag: Element whose <tr> descendants are examined.

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError: If `tag` contains no table rows.
    """
    # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Selected once and reused for both the emptiness check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        # DEBUG: print(f"DEBUG: element[]='{type(element)}'")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)

        # A row may consist of the domain cell only; it then has no reason
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print("DEBUG: Multiple domains detected in one row")
            # NOTE(review): only "develop.gab.com" was readable in the
            # reviewed source; the other two names are derived from the
            # matched cell text - confirm.
            for name in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": name,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def add_peers(rows: dict) -> list:
    """Collect the peers listed under "linked", "allowed" and "blocked".

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked"; a missing key or a None value is skipped.

    Returns:
        All tidied peers that are not blacklisted, in the order they were
        found.
    """
    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if key not in rows or rows[key] is None:
            continue

        # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
        for peer in rows[key]:
            if peer is None:
                # Skip "None" types as tidyup.domain() cannot parse them
                continue

            # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
            peer = tidyup.domain(peer)

            # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
            if blacklist.is_blacklisted(peer):
                # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                continue

            # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
            peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers