]> git.mxchange.org Git - fba.git/blob - fba/federation.py
92669c50cea2356f67e0ec8d0a1ba47889946154
[fba.git] / fba / federation.py
1 # Copyright (C) 2023 Free Software Foundation
2 #
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU Affero General Public License for more details.
12 #
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
15
16 import bs4
17 import validators
18
19 from fba import blacklist
20 from fba import config
21 from fba import csrf
22 from fba import fba
23 from fba import instances
24 from fba import network
25
26 from fba.helpers import tidyup
27
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
31
32 # "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    # Built as scheme x schema version: all "https" variants first, then all
    # "http" variants, each from the newest schema version to the oldest.
    f"{scheme}://nodeinfo.diaspora.software/ns/schema/{version}"
    for scheme in ("https", "http")
    for version in ("2.1", "2.0", "1.1", "1.0")
]
43
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register 'domain' and all valid, non-blacklisted peers it reports.

    All parameters are validated before any lookup is started, so an invalid
    'script' is rejected even when the software type still has to be
    determined.

    Args:
        domain: Domain name of the instance whose peers are fetched.
        origin: Domain that 'domain' was found on, or None.
        software: Software type of 'domain', or None to have it determined
            through determine_software().
        script: Passed through to instances.add() for every new record
            (presumably the name of the invoking command - TODO confirm).
        path: Optional nodeinfo path, passed to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has an unexpected type or 'domain' is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")

    if software is None:
        # Detection is kept out of the validation chain above, otherwise the
        # remaining parameter checks would be skipped for this case.
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the part before the first slash is validated as a domain
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        if not instances.is_registered(instance):
            # The queried domain becomes the origin of each newly found peer
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
103
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peers of 'domain'.

    Requests for "misskey", "lemmy" and "peertube" are delegated to the
    fetch_peers() function of the matching network module. Every other
    software is queried through "/api/v1/instance/peers" first and, when that
    request reports an error, through "/api/v3/site" (key
    'federated_instances').

    Args:
        domain: Domain name of the instance to query.
        software: Software type of 'domain', may be None.

    Returns:
        The peers that were found; an empty list when neither API delivered
        any. For the delegated software types the module's own return value
        is handed back unchanged.

    Raises:
        ValueError: If a parameter has an unexpected type or 'domain' is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Must exist on every path below, including the ones where no API answers
    peers = list()

    # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
    headers = csrf.determine(domain, dict())

    # Both requests share the same timeouts
    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeout
    )
    # DEBUG: print(f"DEBUG: data[]='{type(data)}'")

    if "error_message" in data:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeout
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["json"]["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            print("WARNING: JSON response does not contain 'federated_instances':", domain)
            instances.update_last_error(domain, data)
    else:
        # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
        peers = data["json"]

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
165
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo of 'domain', by auto-discovery or from static paths.

    The result of fetch_wellknown_nodeinfo() is returned when it carries no
    'error_message'. Otherwise a list of well-known static paths is tried in
    order; the first response without 'error_message' wins and is recorded
    as detection mode "STATIC_CHECK".

    Args:
        domain: Domain name of the instance to query.
        path: When set and not empty, only the static path equal to this
            value is requested.

    Returns:
        The response dict of the successful request, or a dict containing
        'error_message' when no request succeeded.

    Raises:
        ValueError: If a parameter has an unexpected type or 'domain' is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
    if len(nodeinfo) > 0 and "error_message" not in nodeinfo:
        # An error result is not empty either, so the length alone must not
        # end the search, or the static paths below would never be tried.
        # DEBUG: print(f"DEBUG: nodeinfo()={len(nodeinfo)} - EXIT!")
        return nodeinfo

    # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
    headers = csrf.determine(domain, dict())

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    # Result for the case that 'path' matches none of the static paths and no
    # request is sent at all. The keys mirror the ones read from failed
    # responses further below.
    if "error_message" in nodeinfo:
        data = nodeinfo
    else:
        data = {
            "status_code"  : None,
            "error_message": f"No static nodeinfo path matches path='{path}'",
        }

    for request in request_paths:
        if path is not None and path != "" and path != request:
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        data = network.get_json_api(
            domain,
            request,
            headers,
            timeout
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" not in data:
            # DEBUG: print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url"  , domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
219
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo of 'domain' through "/.well-known/nodeinfo".

    Each entry of the discovery document's 'links' whose 'rel' is listed in
    nodeinfo_identifier is requested until one response contains 'json'. That
    hit is recorded as detection mode "AUTO_DISCOVERY". Entries without a
    usable 'rel' or 'href' are skipped with a warning instead of raising.

    Args:
        domain: Domain name of the instance to query.

    Returns:
        The response dict of the last request that was made: the nodeinfo
        response on success, otherwise the failed link response or the
        response of the discovery request itself.

    Raises:
        ValueError: If 'domain' is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
    headers = csrf.determine(domain, dict())

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        if "links" in nodeinfo:
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                # The document comes from a remote host, so an entry may be
                # anything and must not be indexed blindly.
                rel  = link.get("rel") if isinstance(link, dict) else None
                href = link.get("href") if isinstance(link, dict) else None

                # DEBUG: print("DEBUG: rel,href:", rel, href)
                if rel not in nodeinfo_identifier:
                    print("WARNING: Unknown 'rel' value:", domain, rel)
                    continue
                elif not isinstance(href, str) or href == "":
                    print(f"WARNING: Link with rel='{rel}' from domain='{domain}' has no usable 'href' - skipped!")
                    continue

                # DEBUG: print("DEBUG: Fetching nodeinfo from:", href)
                data = network.fetch_api_url(
                    href,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                )

                # DEBUG: print("DEBUG: href,data[]:", href, type(data))
                if "json" in data:
                    # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                    instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                    instances.set_data("nodeinfo_url"  , domain, href)
                    break

                # This link failed, remember why and try the next one
                instances.update_last_error(domain, data)
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
267
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of 'domain' from the HTML meta tags found at 'path'.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fallback. The tag's content is passed through tidyup.domain() and is
    then reduced by the fba.remove_version() / fba.strip_*() helpers whenever
    it contains the markers checked below.

    Args:
        domain: Domain name of the instance to query.
        path: Path of the page to fetch, defaults to "/".

    Returns:
        The detected software name or None if nothing usable was found.

    Raises:
        ValueError: If a parameter has an unexpected type or is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A tag without "content" attribute yields None, which must not be
        # handed to tidyup.domain(); such a tag is treated as not found.
        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
330
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software type running at 'domain'.

    The name is taken from [software][name] of the nodeinfo returned by
    fetch_nodeinfo(). Some forks are mapped to the software they derive from
    (e.g. "akkoma" to "pleroma"). Whenever nodeinfo is missing, reports an
    error or yields no name, fetch_generator_from_path() is used instead.

    Args:
        domain: Domain name of the instance to query.
        path: Optional nodeinfo path, passed to fetch_nodeinfo().

    Returns:
        The software name or None if it could not be determined.

    Raises:
        ValueError: If a parameter has an unexpected type or 'domain' is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print("DEBUG: data[]:", type(data))
    if "error_message" in data:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    # DEBUG: print("DEBUG: data():", len(data), data)
    if "status" in data["json"] and data["json"]["status"] == "error" and "message" in data["json"]:
        print("WARNING: JSON response is an error:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data["json"]:
        # The message is part of the JSON payload, not of the response dict
        print("WARNING: JSON response contains only a message:", data["json"]["message"])
        instances.update_last_error(domain, data["json"]["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data["json"] or "name" not in data["json"]["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["json"]["software"]["name"])

    # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software is None:
        # Test for None directly: str(None) is "None" and never empty, so a
        # string comparison would not trigger this fallback.
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
415
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first <td> of a row is taken as the domain, the second one as the
    reason. Rows without any <td>, blacklisted domains and invalid domains
    are skipped.

    Args:
        tag: Element whose <tr> descendants are examined.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: If 'tag' is not a bs4.element.Tag.
        KeyError: If 'tag' contains no <tr> at all.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    found = []
    for row in rows:
        # DEBUG: print(f"DEBUG: row[]={type(row)}")
        cells = row.find_all("td")
        if not cells:
            # Rows without any <td> carry no data
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)
        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue

        if domain == "gab.com/.ai, develop.gab.com":
            # This one cell names three domains that all share the reason
            for name in ("gab.com", "gab.ai", "develop.gab.com"):
                found.append({
                    "domain": name,
                    "reason": reason,
                })
            continue

        if not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        found.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: found()={len(found)} - EXIT!")
    return found
465
def add_peers(rows: dict) -> list:
    """Collect the peers listed under "linked", "allowed" and "blocked".

    Every peer is passed through tidyup.domain(); blacklisted peers are left
    out. A missing structure (None), missing or None lists and None entries
    are tolerated and simply contribute nothing.

    Args:
        rows: Dict that may contain the keys "linked", "allowed" and
            "blocked", or None.

    Returns:
        A list of the tidied, non-blacklisted peers.
    """
    # DEBUG: print(f"DEBUG: rows[]={type(rows)} - CALLED!")
    peers = list()

    if rows is None:
        # The caller passes the 'federated_instances' value through as-is,
        # so there may be nothing to look at.
        # DEBUG: print("DEBUG: rows is None - EXIT!")
        return peers

    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if key in rows and rows[key] is not None:
            # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
            for peer in rows[key]:
                if peer is None:
                    # tidyup.domain() cannot parse "None" types
                    continue

                # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
                peer = tidyup.domain(peer)

                # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
                if blacklist.is_blacklisted(peer):
                    # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                    continue

                # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
                peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers