]> git.mxchange.org Git - fba.git/blob - fba/federation.py
8d5515e0ad8caf21e4c618c6d220eb66d4dae2ac
[fba.git] / fba / federation.py
1 # Copyright (C) 2023 Free Software Foundation
2 #
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU Affero General Public License for more details.
12 #
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
15
16 import sys
17
18 import bs4
19 import validators
20
21 from fba import blacklist
22 from fba import config
23 from fba import fba
24 from fba import instances
25 from fba import network
26
27 from fba.helpers import tidyup
28
29 from fba.networks import lemmy
30 from fba.networks import misskey
31 from fba.networks import peertube
32
33 # "rel" identifiers (no real URLs)
# Accepted values for the "rel" attribute of a well-known nodeinfo link.
# HTTPS variants come first, then plain HTTP; within each scheme the schema
# versions are listed from newest to oldest.
nodeinfo_identifier = [
    f"{scheme}://nodeinfo.diaspora.software/ns/schema/{version}"
    for scheme in ("https", "http")
    for version in ("2.1", "2.0", "1.1", "1.0")
]
44
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register `domain` if needed and register every usable peer it reports.

    Parameters:
        domain:   Domain whose peer list is fetched (non-empty string).
        origin:   Domain that led to `domain`, or None.
        software: Software name of `domain`; when None it is determined
                  through determine_software().
        script:   Name of the invoking script, passed on to instances.add().
        path:     Optional nodeinfo path, passed on to determine_software()
                  and instances.add().

    Raises:
        ValueError: If a parameter has the wrong type or `domain` is empty.

    Returns None in every case; an error is printed when no peer list could
    be fetched.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    # All parameters are validated before any lookup happens, so that a bad
    # 'script' is also rejected when the software still has to be determined.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")

    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the part before the first slash is checked as a domain
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        # DEBUG: print("DEBUG: Handling instance:", instance)
        if not instances.is_registered(instance):
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
104
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of `domain`.

    For the software names "misskey", "lemmy" and "peertube" the call is
    delegated to the matching network module. For everything else the
    endpoint "/api/v1/instance/peers" is queried first and "/api/v3/site"
    (key 'federated_instances') is used as fallback.

    Parameters:
        domain:   Domain to query (non-empty string).
        software: Software name of the domain, or None.

    Returns:
        The list of peers; an empty list when no API returned any. For the
        generic path the peer count is stored as "total_peers" and the last
        instance fetch is updated.

    Raises:
        ValueError: If a parameter has the wrong type or `domain` is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Always defined, so the bookkeeping below also works when every API failed
    peers = list()

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        (config.get("connection_timeout"), config.get("read_timeout"))
    )
    # DEBUG: print(f"DEBUG: data[]='{type(data)}'")

    if "error_message" in data:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = add_peers(data["json"]["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            print("WARNING: JSON response does not contain 'federated_instances':", domain)
            # A plain message is stored here, same as other callers in this
            # file do; there is no response object in this function.
            instances.update_last_error(domain, "JSON response does not contain 'federated_instances'")
    else:
        # DEBUG: print("DEBUG: Querying API was successful:", domain, len(data))
        peers = data["json"]

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
162
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for `domain`, via auto-discovery or well-known paths.

    First fetch_wellknown_nodeinfo() is tried. When its result is empty or
    carries an 'error_message', a fixed list of request paths is queried in
    order until one responds without 'error_message'; on success the
    detection mode "STATIC_CHECK" and the used path are stored.

    Parameters:
        domain: Domain to query (non-empty string).
        path:   When set and non-empty, only the request path equal to it is
                tried from the fixed list.

    Returns:
        The dict of the last request made. When no static path was tried at
        all, this is the result of the auto-discovery attempt.

    Raises:
        ValueError: If a parameter has the wrong type or `domain` is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
    # An error result is a non-empty dict, too, so the length alone does not
    # tell whether auto-discovery worked.
    if len(nodeinfo) > 0 and "error_message" not in nodeinfo:
        # DEBUG: print("DEBUG: nodeinfo()={len(nodeinfo))} - EXIT!")
        return nodeinfo

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    # Fallback result in case 'path' excludes every entry of request_paths
    data = nodeinfo

    for request in request_paths:
        if path is not None and path != "" and path != request:
            # DEBUG: print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        data = network.get_json_api(
            domain,
            request,
            (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" not in data:
            # DEBUG: print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url"  , domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
212
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo of `domain` through "/.well-known/nodeinfo".

    Each entry of the 'links' list whose 'rel' is listed in
    nodeinfo_identifier is fetched until one result contains 'json'; then
    detection mode "AUTO_DISCOVERY" and the link's 'href' are stored. A
    failed link fetch is recorded through instances.update_last_error().

    Parameters:
        domain: Domain to query (non-empty string).

    Returns:
        The dict of the last request made: the linked nodeinfo document's
        result when a link was fetched, otherwise the result of the
        well-known request itself.

    Raises:
        ValueError: If `domain` is not a string or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        if "links" in nodeinfo:
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                # The document comes from a remote server, so an entry may
                # lack the expected keys; such entries are skipped.
                if not isinstance(link, dict) or "rel" not in link or "href" not in link:
                    print("WARNING: Malformed link entry in nodeinfo:", domain)
                    continue

                # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
                if link["rel"] in nodeinfo_identifier:
                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
                    data = network.fetch_api_url(
                        link["href"],
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
                    if "json" in data:
                        # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url"  , domain, link["href"])
                        break
                    else:
                        instances.update_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
256
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Determine the software of `domain` from meta tags of an HTML page.

    The page at `path` is fetched and searched for <meta name="generator">
    first and <meta property="og:site_name"> second. The found content is
    tidied up and stripped of version numbers and phrases such as
    " powered by ", " hosted on ", " by " and " see ".

    Parameters:
        domain: Domain to query (non-empty string).
        path:   Path of the page to fetch (non-empty string).

    Returns:
        The software name, or None when nothing usable was found.

    Raises:
        ValueError: If a parameter has the wrong type or is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        # A tag without 'content' attribute yields None, which must not be
        # handed to tidyup.domain(); such a tag is treated as not found.
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and " powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
319
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name running on `domain`.

    The name is taken from data['software']['name'] of the result of
    fetch_nodeinfo(). Whenever that is not available (no data, an error or
    message-only response, missing keys, or a name that is empty after
    tidying up), fetch_generator_from_path() is used instead. Several fork
    names are mapped to "pleroma", "mastodon" or "misskey", and version
    numbers and "powered by"-like phrases are stripped.

    Parameters:
        domain: Domain to query (non-empty string).
        path:   Optional nodeinfo path, passed on to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If a parameter has the wrong type or `domain` is empty.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print("DEBUG: data[]:", type(data))
    if not isinstance(data, dict) or len(data) == 0:
        # DEBUG: print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    # DEBUG: print("DEBUG: data():", len(data), data)
    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["software"]["name"])

    # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    # None must be tested directly: str(None) is "None", never "", and the
    # substring checks below do not work on None.
    if software is None:
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    # DEBUG: print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
404
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first <td> of each row is taken as domain, the second as reason.
    Rows without two <td> cells, blacklisted domains and invalid domain
    names are skipped. One hard-coded combined row is expanded into three
    separate domains.

    Parameters:
        tag: Element containing the <tr> rows.

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError:   If `tag` contains no <tr> element.
    """
    # DEBUG: print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        # DEBUG: print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue
        elif len(cells) < 2:
            # Without a second cell there is no reason column to read
            print("WARNING: Row has no second <td> for the reason - skipped!")
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print("DEBUG: Multiple domains detected in one row")
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
454
def add_peers(rows: dict) -> list:
    """Collect the tidied, non-blacklisted peers found in `rows`.

    Only the keys "linked", "allowed" and "blocked" are read, in that
    order; a missing key or a None value is ignored.

    Parameters:
        rows: Mapping that may hold a list of peers under each of the keys.

    Returns:
        A list of the peers after tidyup.domain(), without blacklisted ones.
    """
    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    collected = []

    for category in ("linked", "allowed", "blocked"):
        # DEBUG: print(f"DEBUG: Checking category='{category}'")
        if category not in rows or rows[category] is None:
            continue

        # DEBUG: print(f"DEBUG: Adding {len(rows[category])} peer(s) to peers list ...")
        for raw_peer in rows[category]:
            # DEBUG: print(f"DEBUG: raw_peer='{raw_peer}' - BEFORE!")
            cleaned = tidyup.domain(raw_peer)

            # DEBUG: print(f"DEBUG: cleaned='{cleaned}' - AFTER!")
            if not blacklist.is_blacklisted(cleaned):
                # DEBUG: print(f"DEBUG: Adding cleaned='{cleaned}' ...")
                collected.append(cleaned)

    # DEBUG: print(f"DEBUG: collected()={len(collected)} - EXIT!")
    return collected