1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from fba import blacklist
20 from fba import config
23 from fba import instances
24 from fba import network
26 from fba.helpers import tidyup
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
# "rel" identifiers (no real URLs)
# Every schema version is listed for both URL schemes; within a scheme the
# newest version comes first.
nodeinfo_identifier = [
    f"{scheme}://nodeinfo.diaspora.software/ns/schema/{version}"
    for scheme in ("https", "http")
    for version in ("2.1", "2.0", "1.1", "1.0")
]
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register 'domain' if it is new, fetch its peers and register each usable peer.

    Parameters:
        domain:   Domain of the instance to crawl, a non-empty string.
        origin:   Domain handed to instances.add() as origin of 'domain', or None.
        software: Software name of the instance. When None it is determined
                  through determine_software().
        script:   Non-empty string handed to instances.add() for every newly
                  added instance (presumably the name of the invoking command;
                  verify against callers).
        path:     Optional path forwarded to determine_software() and
                  instances.add().

    Raises:
        ValueError: When a parameter has the wrong type or 'domain'/'script'
                    is an empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif script == "":
        raise ValueError("Parameter 'script' is empty")

    # Software detection is deliberately kept out of the validation chain
    # above: as one of its branches it caused the checks on 'script' to be
    # skipped whenever the software had to be detected.
    if software is None:
        software = determine_software(domain, path)

    if not instances.is_registered(domain):
        instances.add(domain, origin, script, path)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data collected for this domain while fetching
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the part before the first slash is validated as a domain
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            continue

        if not instances.is_registered(instance):
            # The crawled domain becomes the origin of its peers
            instances.add(instance, domain, script)
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peers of 'domain' and record their number.

    Instances running misskey, lemmy or peertube are delegated to the
    fetch_peers() function of the matching fba.networks module. Every other
    instance is asked through "/api/v1/instance/peers" first and through an
    alternative API whose JSON carries 'federated_instances' second.

    Parameters:
        domain:   Domain of the instance, a non-empty string.
        software: Software name of the instance or None.

    Returns:
        The fetched peers; an empty list when the CSRF check raised or no API
        delivered any peers.

    Raises:
        ValueError: When 'domain' is not a non-empty string or 'software' is
                    neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if software is not None and not isinstance(software, str):
        raise ValueError(f"software[]={type(software)} is not 'str'")

    # These networks come with their own peer-fetching implementation
    dedicated = {"misskey": misskey, "lemmy": lemmy, "peertube": peertube}
    if software in dedicated:
        return dedicated[software].fetch_peers(domain)

    peers = []

    try:
        headers = csrf.determine(domain, {})
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return peers

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    if "error_message" not in data:
        # Querying the first API was successful, its JSON is the peer list
        peers = data["json"]
    else:
        # Was not able to fetch peers, trying the alternative API
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            peers = add_peers(data["json"]["federated_instances"])
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)

    instances.set_data("total_peers", domain, len(peers))
    instances.update_last_instance_fetch(domain)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> list:
    """Fetch the nodeinfo of 'domain' by auto-discovery or from well-known static paths.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. Only
    when it delivers nothing usable, the static paths are requested one after
    another until one answers without 'error_message'. On success the
    detection mode and the nodeinfo URL are stored through instances.set_data().

    Parameters:
        domain: Domain of the instance, a non-empty string.
        path:   When given and not empty, only the static path equal to it is
                requested.

    Returns:
        The response of the last request made: it holds 'json' on success or
        'error_message' and 'status_code' on failure. An empty list is
        returned when the CSRF check raised.

    Raises:
        ValueError: When 'domain' is not a non-empty string or 'path' is
                    neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # A failed request is a non-empty response, too. Checking the length
    # alone returned such errors right away and the static paths below were
    # never tried.
    if len(nodeinfo) > 0 and "error_message" not in nodeinfo:
        return nodeinfo

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return list()

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    # Start with the auto-discovery result so there is always something to
    # return, even when 'path' matches none of the static paths.
    data = nodeinfo

    for request in request_paths:
        if path is not None and path != "" and path != request:
            # A specific path was asked for and this is not it
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
        )

        if "error_message" not in data:
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url", domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
def fetch_wellknown_nodeinfo(domain: str) -> list:
    """Discover and fetch the nodeinfo of 'domain' through "/.well-known/nodeinfo".

    The links of the discovery document are walked in document order. The
    first link whose 'rel' is listed in nodeinfo_identifier and whose 'href'
    delivers JSON ends the search; detection mode and nodeinfo URL are then
    stored through instances.set_data().

    Parameters:
        domain: Domain of the instance, a non-empty string.

    Returns:
        The response of the last request made (discovery document or linked
        nodeinfo); an empty list when the CSRF check raised.

    Raises:
        ValueError: When 'domain' is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        headers = csrf.determine(domain, {})
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return list()

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" in data:
        # Discovery failed, hand the error response to the caller
        return data

    nodeinfo = data["json"]
    if "links" not in nodeinfo:
        print("WARNING: nodeinfo does not contain 'links':", domain)
        return data

    for link in nodeinfo["links"]:
        if link["rel"] not in nodeinfo_identifier:
            print("WARNING: Unknown 'rel' value:", domain, link["rel"])
            continue

        data = network.fetch_api_url(
            link["href"],
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "json" in data:
            instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
            instances.set_data("nodeinfo_url", domain, link["href"])
            break

        # No JSON behind this link, remember the failure and try the next one
        instances.update_last_error(domain, data)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of 'domain' from the meta tags of an HTML page.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fallback. The found value is passed through tidyup.domain() and then
    stripped of version numbers and of "powered by" / "hosted on" / " by " /
    " see " phrases using the fba helper functions.

    Parameters:
        domain: Domain of the instance, a non-empty string.
        path:   Path of the page to fetch, a non-empty string.

    Returns:
        The detected software name or None when nothing was found.

    Raises:
        ValueError: When 'domain' or 'path' is not a non-empty string.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    if response.ok and response.status_code < 300 and len(response.text) > 0:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A meta tag may lack its 'content' attribute; such a tag is treated
        # as absent because tidyup.domain() cannot parse None.
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # Was assigned to a misspelled 'sofware' before, which discarded
            # the site name and logged None instead.
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # Nothing usable is left, report "not found" instead of an empty name
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = fba.remove_version(software)

    if isinstance(software, str) and " powered by " in software:
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = fba.strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software 'domain' is running.

    The nodeinfo from fetch_nodeinfo() is the preferred source. When it is
    missing, reports an error or names no software,
    fetch_generator_from_path() is used as fallback. Some forks are mapped to
    the software they are based on (e.g. "akkoma" to "pleroma").

    Parameters:
        domain: Domain of the instance, a non-empty string.
        path:   Optional path forwarded to fetch_nodeinfo().

    Returns:
        The software name or None when it could not be determined.

    Raises:
        ValueError: When 'domain' is not a non-empty string or 'path' is
                    neither a string nor None.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    data = fetch_nodeinfo(domain, path)

    # Besides an error, the response may be empty or carry JSON that is not
    # an object; none of these can name the software.
    if "error_message" in data or "json" not in data or not isinstance(data["json"], dict):
        return fetch_generator_from_path(domain)

    nodeinfo = data["json"]

    if "status" in nodeinfo and nodeinfo["status"] == "error" and "message" in nodeinfo:
        print("WARNING: JSON response is an error:", nodeinfo["message"])
        instances.update_last_error(domain, nodeinfo["message"])
        return fetch_generator_from_path(domain)
    elif "message" in nodeinfo:
        # The message is part of the JSON document, not of the response wrapper
        print("WARNING: JSON response contains only a message:", nodeinfo["message"])
        instances.update_last_error(domain, nodeinfo["message"])
        return fetch_generator_from_path(domain)

    software_info = nodeinfo.get("software")
    if not isinstance(software_info, dict) or not isinstance(software_info.get("name"), str):
        # JSON response does not include a usable [software][name]
        return fetch_generator_from_path(domain)

    software = tidyup.domain(software_info["name"])

    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = fba.strip_until(software, " see ")

    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    if software is None:
        # Software was not detected, trying generator. This used to compare
        # str(software) with "", which never matched after the reset to None
        # above and made the version check below fail on None.
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = fba.remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = fba.remove_version(fba.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first cell of a row is taken as domain, the second one as reason.
    Rows without any <td>, blacklisted domains and invalid domains are left
    out.

    Parameters:
        tag: Element containing the <tr> rows to read.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: When 'tag' is not a bs4.element.Tag.
        KeyError:   When 'tag' contains no <tr> at all.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    table_rows = tag.select("tr")
    if len(table_rows) == 0:
        raise KeyError("No table rows found in table!")

    found = []
    for row in table_rows:
        cells = row.find_all("td")
        if not cells:
            # E.g. a header row, it carries no data cells
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue

        if domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains in one row, each gets its own entry with the
            # shared reason
            for single in ("gab.com", "gab.ai", "develop.gab.com"):
                found.append({
                    "domain": single,
                    "reason": reason,
                })
            continue

        if not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        found.append({
            "domain": domain,
            "reason": reason,
        })

    return found
def add_peers(rows: dict) -> list:
    """Collect the peers listed under "linked", "allowed" and "blocked".

    Every peer is passed through tidyup.domain(); peers reported by
    blacklist.is_blacklisted() are left out. Missing keys and keys holding
    None are ignored.

    Parameters:
        rows: Mapping that may hold an iterable of peers under each of the
              three keys.

    Returns:
        The collected peers, in the order "linked", "allowed", "blocked".
    """
    collected = []
    for section in ("linked", "allowed", "blocked"):
        entries = rows[section] if section in rows else None
        if entries is None:
            continue

        for raw_peer in entries:
            cleaned = tidyup.domain(raw_peer)
            if not blacklist.is_blacklisted(cleaned):
                collected.append(cleaned)

    return collected