1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from fba import blacklist
20 from fba import config
23 from fba import instances
24 from fba import network
26 from fba.helpers import tidyup
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
# Accepted "rel" values for the entries listed under "links" in a
# "/.well-known/nodeinfo" document. fetch_wellknown_nodeinfo() only follows a
# link whose "rel" is contained in this list. The entries cover the schema
# versions 2.1, 2.0, 1.1 and 1.0, each once with an https:// and once with an
# http:// prefix.
# NOTE(review): the closing bracket of this list is not part of the reviewed
# listing - confirm it is present in the repository copy.
32 # "rel" identifiers (no real URLs)
33 nodeinfo_identifier = [
34 "https://nodeinfo.diaspora.software/ns/schema/2.1",
35 "https://nodeinfo.diaspora.software/ns/schema/2.0",
36 "https://nodeinfo.diaspora.software/ns/schema/1.1",
37 "https://nodeinfo.diaspora.software/ns/schema/1.0",
38 "http://nodeinfo.diaspora.software/ns/schema/2.1",
39 "http://nodeinfo.diaspora.software/ns/schema/2.0",
40 "http://nodeinfo.diaspora.software/ns/schema/1.1",
41 "http://nodeinfo.diaspora.software/ns/schema/1.0",
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register `domain` and every usable peer it reports.

    The domain is added through instances.add() when it is not registered
    yet, its peer list is fetched with fetch_peers(), and every peer that
    survives tidyup.domain(), validators.domain() and the blacklist check is
    registered with `domain` as its origin.

    Parameters:
        domain:   Domain to fetch peers from; must be a non-empty string.
        origin:   Domain that referred to `domain`, or None.
        software: Software name of `domain`; when None it is detected with
                  determine_software().
        script:   Non-empty string handed on to instances.add().
        path:     Optional path handed on to determine_software() and
                  instances.add().

    Raises:
        ValueError: If a parameter has the wrong type, or `domain` or
                    `script` is empty.
    """
    # NOTE(review): the short guard/flow statements (`elif domain == "":`,
    # `elif script == "":`, `if peerlist is None:`, `if instance is None:`,
    # `if instance == "":`, `return`, `continue`) were absent from the
    # reviewed listing and are restored here from context - confirm against
    # the repository copy.

    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        # Previously reported 'domain' although `script` is the empty value
        raise ValueError("Parameter 'script' is empty")

    # Detection runs only after ALL parameters were validated. Inside the
    # elif-chain it short-circuited the checks on `script`.
    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the host part (before any "/") is validated
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        if not instances.is_registered(instance):
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
# Fetch the peer list of `domain`.
#
# For software "misskey", "lemmy" and "peertube" the request is delegated to
# the matching fba.networks module. For everything else the path
# "/api/v1/instance/peers" is queried through network.get_json_api(); when
# that answer contains "error_message", a second get_json_api() request is
# made and its "federated_instances" entry is converted with add_peers().
# Afterwards "total_peers" and the last instance fetch are recorded for
# `domain`.
#
# Raises ValueError when `domain` is not a string or is empty, or when
# `software` is neither a string nor None.
#
# NOTE(review): several short lines of this function (the empty-domain guard,
# some call arguments including the path of the second request, the `else:`
# lines, the initialisation of `peers` and the final return) are not part of
# the reviewed listing - verify against the repository copy before changing
# this block.
104 def fetch_peers(domain: str, software: str) -> list:
105 # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
106 if not isinstance(domain, str):
107 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
109 raise ValueError("Parameter 'domain' is empty")
110 elif not isinstance(software, str) and software is not None:
111 raise ValueError(f"software[]={type(software)} is not 'str'")
# Software with a dedicated implementation is handled by its own module
113 if software == "misskey":
114 # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
115 return misskey.fetch_peers(domain)
116 elif software == "lemmy":
117 # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
118 return lemmy.fetch_peers(domain)
119 elif software == "peertube":
120 # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
121 return peertube.fetch_peers(domain)
# Generic path: headers come from csrf.determine() and are reused below
123 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
124 headers = csrf.determine(domain, dict())
126 # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
127 data = network.get_json_api(
129 "/api/v1/instance/peers",
131 (config.get("connection_timeout"), config.get("read_timeout"))
133 # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
# An "error_message" key marks a failed request; a second request is tried
135 if "error_message" in data:
136 # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
137 data = network.get_json_api(
140 (config.get("connection_timeout"), config.get("read_timeout"))
143 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
144 if "error_message" in data:
145 print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
146 elif "federated_instances" in data["json"]:
147 # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
# NOTE(review): `peers` must already be a list here; its initialisation is
# not visible in the reviewed listing
148 peers = peers + add_peers(data["json"]["federated_instances"])
149 # DEBUG: print("DEBUG: Added instance(s) to peers")
151 print("WARNING: JSON response does not contain 'federated_instances':", domain)
152 instances.update_last_error(domain, data)
154 # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
157 # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
158 instances.set_data("total_peers", domain, len(peers))
160 # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
161 instances.update_last_instance_fetch(domain)
163 # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
# Fetch nodeinfo data for `domain`.
#
# Auto-discovery through fetch_wellknown_nodeinfo() is tried first. The
# static paths in `request_paths` are requested with network.get_json_api()
# afterwards; a non-empty `path` restricts this to the single entry equal to
# it. On an answer without "error_message" the detection mode "STATIC_CHECK"
# and the used path are stored for `domain`.
#
# Raises ValueError when `domain` is not a string or is empty, or when `path`
# is neither a string nor None.
#
# NOTE(review): the return annotation says `list`, but the value is tested
# with `"error_message" in data` here and indexed as data["json"] by
# determine_software() - it looks like a dict; confirm and adjust the
# annotation.
# NOTE(review): several short lines (the empty-domain guard, parts of the
# request_paths list, call arguments, `continue`/`break`/`return`
# statements) are not part of the reviewed listing - verify against the
# repository copy before changing this block.
166 def fetch_nodeinfo(domain: str, path: str = None) -> list:
167 # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
168 if not isinstance(domain, str):
169 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
171 raise ValueError("Parameter 'domain' is empty")
172 elif not isinstance(path, str) and path is not None:
173 raise ValueError(f"Parameter path[]={type(path)} is not 'str'")
175 # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
176 nodeinfo = fetch_wellknown_nodeinfo(domain)
# NOTE(review): presumably the auto-discovery result is returned right here
# when it is non-empty; if that result also carries entries on failure, the
# static checks below would never run - confirm
178 # DEBUG: print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
179 if len(nodeinfo) > 0:
180 # DEBUG: print("DEBUG: nodeinfo()={len(nodeinfo))} - EXIT!")
183 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
184 headers = csrf.determine(domain, dict())
# Static nodeinfo locations (only the entries below are visible in the listing)
187 "/nodeinfo/2.1.json",
189 "/nodeinfo/2.0.json",
195 for request in request_paths:
# A caller-supplied path limits the requests to exactly that entry
196 if path is not None and path != "" and path != request:
197 # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
200 # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
201 data = network.get_json_api(
205 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
208 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
209 if "error_message" not in data:
210 # DEBUG: print("DEBUG: Success:", request)
211 instances.set_data("detection_mode", domain, "STATIC_CHECK")
212 instances.set_data("nodeinfo_url" , domain, request)
215 print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
# NOTE(review): `data` is only assigned inside the loop as far as the listing
# shows; check what is returned when every entry is skipped
217 # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
# Discover nodeinfo for `domain` through "/.well-known/nodeinfo".
#
# The well-known document is requested with network.get_json_api(). When the
# answer has no "error_message", the entries of its "links" list are walked
# and a link whose "rel" is contained in nodeinfo_identifier is fetched with
# network.fetch_api_url(). The detection mode "AUTO_DISCOVERY" and the link's
# "href" are stored for `domain`; otherwise the last error is updated or a
# warning is printed.
#
# Raises ValueError when `domain` is not a string or is empty.
#
# NOTE(review): the return annotation says `list`, while the final debug
# comment speaks of returning `data`, which is tested with
# `"error_message" not in data` - it looks like a dict; confirm.
# NOTE(review): several short lines (the empty-domain guard, call arguments,
# the condition in front of the AUTO_DISCOVERY statements, `else:`/`break`/
# `return` statements) are not part of the reviewed listing - verify against
# the repository copy before changing this block.
220 def fetch_wellknown_nodeinfo(domain: str) -> list:
221 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
222 if not isinstance(domain, str):
223 raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
225 raise ValueError("Parameter 'domain' is empty")
227 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
228 headers = csrf.determine(domain, dict())
230 # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
231 data = network.get_json_api(
233 "/.well-known/nodeinfo",
# The well-known request uses the dedicated nodeinfo_* timeouts ...
235 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
238 if "error_message" not in data:
239 nodeinfo = data["json"]
240 # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
241 if "links" in nodeinfo:
242 # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
243 for link in nodeinfo["links"]:
244 # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
# "rel" and "href" are read without a presence check; an entry lacking one
# of them raises KeyError
245 if link["rel"] in nodeinfo_identifier:
246 # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
247 data = network.fetch_api_url(
# ... while the linked document is fetched with the generic timeouts
249 (config.get("connection_timeout"), config.get("read_timeout"))
252 # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
254 # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
255 instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
256 instances.set_data("nodeinfo_url" , domain, link["href"])
259 instances.update_last_error(domain, data)
261 print("WARNING: Unknown 'rel' value:", domain, link["rel"])
263 print("WARNING: nodeinfo does not contain 'links':", domain)
265 # DEBUG: print("DEBUG: Returning data[]:", type(data))
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of `domain` from the HTML served at `path`.

    The page is fetched with network.fetch_response(). The content of
    <meta name="generator"> is preferred; <meta property="og:site_name"> is
    the fallback. The detection mode ("GENERATOR" or "SITE_NAME") is stored
    for `domain`, and version numbers as well as "powered by"/"hosted on"/
    " by "/" see " decorations are stripped from the name.

    Parameters:
        domain: Domain to query; must be a non-empty string.
        path:   Path to request; must be a non-empty string.

    Returns:
        The detected software name, or None when the page could not be
        fetched, carries none of the two meta tags or the name is empty.

    Raises:
        ValueError: If `domain` or `path` is not a string or is empty.
    """
    # NOTE(review): the short statements `elif domain == "":`,
    # `elif path == "":`, `software = None` and `return software` were absent
    # from the reviewed listing and are restored here from context - confirm
    # against the repository copy.

    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name": "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A meta tag without a "content" attribute yields None, which must not
        # be handed to tidyup.domain(); such a tag is treated as absent.
        generator_content = generator.get("content") if isinstance(generator, bs4.element.Tag) else None
        site_name_content = site_name.get("content") if isinstance(site_name, bs4.element.Tag) else None

        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        if isinstance(generator_content, str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator_content)
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name_content, str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            # Was assigned to the misspelled name `sofware`, so the detected
            # value never reached `software`
            software = tidyup.domain(site_name_content)
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software `domain` runs.

    The name is read from [software][name] of the nodeinfo data returned by
    fetch_nodeinfo(). Known forks are mapped to their upstream name
    (akkoma/rebased -> pleroma, hometown/ecko -> mastodon, calckey and others
    -> misskey), and decorations such as versions or "powered by" are
    removed. Whenever nodeinfo cannot be used or yields no name,
    fetch_generator_from_path() is asked instead.

    Parameters:
        domain: Domain to check; must be a non-empty string.
        path:   Optional nodeinfo path handed on to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If `domain` is not a string or is empty, or `path` is
                    neither a string nor None.
    """
    # NOTE(review): the short statements `elif domain == "":`,
    # `software = None`, `software = "pleroma"`, `software = "misskey"`,
    # `if software == "":` and `return software` were absent from the
    # reviewed listing and are restored here from context - confirm against
    # the repository copy.

    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print("DEBUG: data[]:", type(data))
    if "error_message" in data:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    # DEBUG: print("DEBUG: data():", len(data), data)
    if "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
        print("WARNING: JSON response is an error:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data["json"]:
        # The message lives below "json"; data["message"] raised KeyError
        print("WARNING: JSON response contains only a message:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data["json"] or "name" not in data["json"]["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["json"]["software"]["name"])

    # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software is None:
        # The former test `str(software) == ""` could never match once the
        # empty name had been normalised to None (str(None) is "None"), which
        # left this fallback unreachable and let `"." in software` fail with
        # TypeError on None.
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
# Extract domains and their reasons from the rows of an HTML table.
#
# Every <tr> of `tag` that contains a <td> is read: the text of the first
# cell is cleaned with tidyup.domain(), the text of the second cell with
# tidyup.reason(). Blacklisted domains and domains rejected by
# validators.domain() are skipped with a warning. The literal row
# "gab.com/.ai, develop.gab.com" is special-cased as several domains.
#
# Raises ValueError when `tag` is not a bs4.element.Tag and KeyError when it
# contains no <tr> at all.
#
# NOTE(review): the statements creating and filling `domains` (including the
# dict literals appended to it), the `continue` statements and the final
# return are not part of the reviewed listing - verify against the
# repository copy before changing this block.
416 def find_domains(tag: bs4.element.Tag) -> list:
417 # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
418 if not isinstance(tag, bs4.element.Tag):
419 raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")
420 elif len(tag.select("tr")) == 0:
421 raise KeyError("No table rows found in table!")
# NOTE(review): tag.select("tr") is evaluated a second time here; the result
# of the check above could be reused
424 for element in tag.select("tr"):
425 # DEBUG: print(f"DEBUG: element[]={type(element)}")
# Rows without any <td> (e.g. pure header rows) carry no data
426 if not element.find("td"):
427 # DEBUG: print("DEBUG: Skipping element, no <td> found")
430 domain = tidyup.domain(element.find("td").text)
# NOTE(review): index [1] requires a second <td>; a row with a single cell
# raises IndexError here. findAll() is the legacy spelling of find_all().
431 reason = tidyup.reason(element.findAll("td")[1].text)
433 # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")
435 if blacklist.is_blacklisted(domain):
436 print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
# One known row lists several domains in a single cell
438 elif domain == "gab.com/.ai, develop.gab.com":
439 # DEBUG: print("DEBUG: Multiple domains detected in one row")
449 "domain": "develop.gab.com",
453 elif not validators.domain(domain):
454 print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
457 # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
463 # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
def add_peers(rows: dict) -> list:
    """Collect the peers listed under "linked", "allowed" and "blocked".

    Each peer is cleaned with tidyup.domain(); blacklisted peers are left
    out. Keys that are missing from `rows` or set to None are ignored.

    Parameters:
        rows: Mapping that may contain the keys "linked", "allowed" and
              "blocked", each holding an iterable of peers or None.

    Returns:
        List of cleaned, non-blacklisted peers in the order they were found.
    """
    # NOTE(review): the short statements `peers = list()`, `continue`,
    # `peers.append(peer)` and `return peers` were absent from the reviewed
    # listing and are restored here from context - confirm against the
    # repository copy.

    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    peers = list()

    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if key in rows and rows[key] is not None:
            # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
            for peer in rows[key]:
                if peer is None:
                    # Skip "None" types as tidyup.domain() cannot parse them
                    # (same rule fetch_instances() applies to its peer list)
                    continue

                # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
                peer = tidyup.domain(peer)

                # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
                if blacklist.is_blacklisted(peer):
                    # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                    continue

                # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
                peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers