]> git.mxchange.org Git - fba.git/blob - fba/federation.py
b7e86a21dd5593fa5b8dc38ea6bc82e490c9a11e
[fba.git] / fba / federation.py
1 # Copyright (C) 2023 Free Software Foundation
2 #
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU Affero General Public License for more details.
12 #
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
15
16 import sys
17
18 import bs4
19 import validators
20
21 from fba import blacklist
22 from fba import config
23 from fba import fba
24 from fba import instances
25 from fba import network
26
27 from fba.helpers import tidyup
28
29 from fba.networks import lemmy
30 from fba.networks import misskey
31 from fba.networks import peertube
32
33 # "rel" identifiers (no real URLs)
# Accepted schema identifiers: both URL schemes combined with the schema
# versions 2.1, 2.0, 1.1 and 1.0. The generated order (all "https" entries
# first, newest version first) is identical to a hand-written list.
nodeinfo_identifier = [
    f"{scheme}://nodeinfo.diaspora.software/ns/schema/{version}"
    for scheme in ("https", "http")
    for version in ("2.1", "2.0", "1.1", "1.0")
]
44
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """
    Registers 'domain' if it is not registered yet, fetches its peer list and
    registers every peer that is valid, not blacklisted and not yet registered.

    Parameters:
        domain   - domain to fetch the peers from, must be a non-empty string
        origin   - domain that led to 'domain', or None
        software - software name of 'domain'; when None it is determined
                   through determine_software()
        script   - handed to instances.add() for every newly added row
        path     - optional path handed to determine_software() and
                   instances.add()

    Raises:
        ValueError - when a parameter has the wrong type or 'domain' is empty

    Returns nothing; when no peer list could be fetched the function logs an
    error and returns early.
    """
    print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")

    # Validate every parameter first. Software detection is deliberately NOT
    # part of this chain, otherwise 'script' would be left unchecked whenever
    # software is None.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    if software is None:
        print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)
        print(f"DEBUG: Determined software='{software}' for domain='{domain}'")

    if not instances.is_registered(domain):
        print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the part before the first slash is checked as a domain
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            print("DEBUG: instance is blacklisted:", instance)
            continue

        print("DEBUG: Handling instance:", instance)
        try:
            if not instances.is_registered(instance):
                print("DEBUG: Adding new instance:", instance, domain)
                instances.add(instance, domain, script)
        except Exception as exception:
            # Best effort: one failing peer must not abort the whole run.
            # KeyboardInterrupt and SystemExit are intentionally not caught.
            print(f"ERROR: instance='{instance}',exception[{type(exception)}]:'{str(exception)}'")
            continue

    print("DEBUG: EXIT!")
108
def fetch_peers(domain: str, software: str) -> list:
    """
    Fetches the peers of 'domain'.

    For "misskey", "lemmy" and "peertube" the work is delegated to the
    fetch_peers() function of the matching fba.networks module and its result
    is returned unchanged. For every other software (including None) the path
    /api/v1/instance/peers is queried first and /api/v3/site is tried as an
    alternative when that request fails.

    Parameters:
        domain   - domain to query, must be a non-empty string
        software - software name of 'domain', or None

    Raises:
        ValueError - when a parameter has the wrong type or 'domain' is empty

    Returns the peers found; an empty list when no API could be queried.
    """
    # Validate before touching the parameters: len(domain) in the debug line
    # below would otherwise raise TypeError for non-sized values.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]={type(software)} is not 'str'")

    print(f"DEBUG: domain({len(domain)})={domain},software={software} - CALLED!")

    if software == "misskey":
        print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    peers = list()
    timeout = (config.get("connection_timeout"), config.get("read_timeout"))
    response = network.fetch_response(domain, "/api/v1/instance/peers", network.api_headers, timeout)
    print(f"DEBUG: response[]='{type(response)}'")

    data = network.json_from_response(response)
    print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")

    if not response.ok or response.status_code >= 400:
        print("DEBUG: Was not able to fetch peers, trying alternative ...")
        response = network.fetch_response(domain, "/api/v3/site", network.api_headers, timeout)

        data = network.json_from_response(response)
        print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
        if not response.ok or response.status_code >= 400:
            print("WARNING: Could not reach any JSON API:", domain)
            instances.update_last_error(domain, response)
        elif isinstance(data, list):
            # NOTE(review): terminating the whole process because a remote
            # host answered with a list looks like a debugging aid - confirm
            # whether this should become a logged error instead.
            print(f"DEBUG: domain='{domain}' returned a list: '{data}'")
            sys.exit(255)
        elif isinstance(data, dict) and "federated_instances" in data:
            print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["federated_instances"])
            print("DEBUG: Added instance(s) to peers")
        else:
            # Also reached when the decoded JSON is not a dict at all
            print("WARNING: JSON response does not contain 'federated_instances':", domain)
            instances.update_last_error(domain, response)
    else:
        print("DEBUG: Querying API was successful:", domain, len(data))
        peers = data

    print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_data("total_peers", domain, len(peers))

    print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.update_last_instance_fetch(domain)

    print("DEBUG: Returning peers[]:", type(peers))
    return peers
167
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """
    Fetches nodeinfo data for 'domain'.

    The result of fetch_wellknown_nodeinfo() is returned when it is not
    empty. Otherwise a fixed list of static paths is requested one after the
    other until one of them answers successfully with a JSON object.

    Parameters:
        domain - domain to query, must be a non-empty string
        path   - when set and not empty, only the static path equal to it is
                 requested, all others are skipped

    Raises:
        ValueError - when a parameter has the wrong type or 'domain' is empty

    Returns the decoded JSON of the successful request, otherwise the decoded
    JSON of the last attempted request, or an empty dict when no static path
    was requested at all.
    """
    print(f"DEBUG: domain='{domain}',path={path} - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    print(f"DEBUG: nodeinfo({len(nodeinfo)})={nodeinfo}")
    if len(nodeinfo) > 0:
        print(f"DEBUG: nodeinfo()={len(nodeinfo)} - EXIT!")
        return nodeinfo

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]
    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    # Pre-set the result so it is defined even when 'path' matches none of
    # the static paths and every iteration below is skipped.
    data = dict()

    for request in request_paths:
        if path is not None and path != "" and path != request:
            print(f"DEBUG: path='{path}' does not match request='{request}' - SKIPPED!")
            continue

        print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        response = network.fetch_response(domain, request, network.api_headers, timeout)

        data = network.json_from_response(response)
        print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
        if response.ok and isinstance(data, dict):
            print("DEBUG: Success:", request)
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url", domain, request)
            break
        elif response.ok and isinstance(data, list):
            # NOTE(review): a list answer terminates the whole process -
            # confirm this is still intended outside of debugging.
            print(f"UNSUPPORTED: domain='{domain}' returned a list: '{data}'")
            sys.exit(255)
        elif not response.ok or response.status_code >= 400:
            print("WARNING: Failed fetching nodeinfo from domain:", domain)
            instances.update_last_error(domain, response)
            continue

    print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
219
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """
    Requests /.well-known/nodeinfo from 'domain' and follows the first link
    whose "rel" value is a known schema identifier and whose target answers
    successfully with a JSON object.

    Parameters:
        domain - domain to query, must be a non-empty string

    Raises:
        ValueError - when 'domain' is not a string or is empty

    Returns the decoded JSON of the last request that was made. This is the
    linked nodeinfo document on success, but it is the discovery document
    itself when no link was followed.
    NOTE(review): callers only test the length of the result, so a discovery
    document without a usable link is treated like nodeinfo - confirm this is
    intended.
    """
    print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    print("DEBUG: Fetching .well-known info for domain:", domain)
    response = network.fetch_response(domain, "/.well-known/nodeinfo", network.api_headers, (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout")))

    data = network.json_from_response(response)
    print("DEBUG: domain,response.ok,data[]:", domain, response.ok, type(data))
    if response.ok and isinstance(data, dict):
        nodeinfo = data
        print("DEBUG: Found entries:", len(nodeinfo), domain)
        # The document comes from a remote host, so neither the type of
        # "links" nor the shape of its entries can be relied upon.
        if isinstance(nodeinfo.get("links"), list):
            print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                if not isinstance(link, dict) or "rel" not in link or "href" not in link:
                    print("WARNING: Malformed link entry in nodeinfo:", domain, link)
                    continue

                print("DEBUG: rel,href:", link["rel"], link["href"])
                if link["rel"] in nodeinfo_identifier:
                    print("DEBUG: Fetching nodeinfo from:", link["href"])
                    response = fba.fetch_url(link["href"], network.api_headers, (config.get("connection_timeout"), config.get("read_timeout")))

                    data = network.json_from_response(response)
                    print("DEBUG: href,response.ok,response.status_code:", link["href"], response.ok, response.status_code)
                    if response.ok and isinstance(data, dict):
                        print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
                        instances.set_data("nodeinfo_url", domain, link["href"])
                        break
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    print("DEBUG: Returning data[]:", type(data))
    return data
257
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """
    Tries to detect the software of 'domain' from the HTML page at 'path'.

    The content of <meta name="generator"> is preferred; the content of
    <meta property="og:site_name"> is used as fallback. The value found is
    then cleaned from version numbers and from "powered by", "hosted on",
    " by " and " see " phrases through the fba helper functions.

    Parameters:
        domain - domain to query, must be a non-empty string
        path   - path of the page to fetch, must be a non-empty string

    Raises:
        ValueError - when a parameter has the wrong type or is empty

    Returns the detected software name, or None when nothing was found.
    """
    # Validation comes first: the debug line must not evaluate anything on
    # parameters that may have the wrong type.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        print("DEBUG: Search for <meta name='generator'>:", domain)
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name": "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        # A meta tag may lack its "content" attribute, in which case get()
        # returns None; such a tag is treated as "not found".
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
            fba.remove_pending_error(domain)
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")
            fba.remove_pending_error(domain)

    print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and software == "":
        print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and " powered by " in software:
        print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = fba.remove_version(fba.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    print(f"DEBUG: software='{software}' - EXIT!")
    return software
322
def determine_software(domain: str, path: str = None) -> str:
    """
    Determines which software 'domain' runs.

    The name is taken from data["software"]["name"] of the nodeinfo data
    returned by fetch_nodeinfo(). Whenever nodeinfo is missing, reports an
    error/message, or carries no usable name, fetch_generator_from_path()
    is used instead. Some fork names are mapped to the software they derive
    from (e.g. "akkoma" becomes "pleroma").

    Parameters:
        domain - domain to check, must be a non-empty string
        path   - optional nodeinfo path, handed to fetch_nodeinfo()

    Raises:
        ValueError - when a parameter has the wrong type or 'domain' is empty

    Returns the software name, or None when it could not be determined.
    """
    # Validate before the debug line, which evaluates len(domain)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    print(f"DEBUG: domain({len(domain)})={domain},path={path} - CALLED!")
    print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    print("DEBUG: data[]:", type(data))
    if not isinstance(data, dict) or len(data) == 0:
        print("DEBUG: Could not determine software type:", domain)
        return fetch_generator_from_path(domain)

    print("DEBUG: data():", len(data), data)
    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.update_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif not isinstance(data.get("software"), dict) or not isinstance(data["software"].get("name"), str):
        # Covers a missing key as well as remote data of an unexpected type
        print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)

        print(f"DEBUG: Generator for domain='{domain}' is: {software}, EXIT!")
        return software

    software = tidyup.domain(data["software"]["name"])

    print("DEBUG: sofware after tidyup.domain():", software)
    if software in ["akkoma", "rebased"]:
        print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.strip_powered_by(software)
    elif " by " in software:
        print(f"DEBUG: software='{software}' has ' by ' in it")
        software = fba.strip_until(software, " by ")
    elif " see " in software:
        print(f"DEBUG: software='{software}' has ' see ' in it")
        software = fba.strip_until(software, " see ")

    print(f"DEBUG: software[]={type(software)}")
    if software == "":
        # An empty name must trigger the generator fallback directly.
        # Turning it into None first would make the fallback unreachable
        # and break the substring checks below.
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = fba.remove_version(software)

    print(f"DEBUG: software[]={type(software)}")
    if isinstance(software, str) and "powered by" in software:
        print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = fba.remove_version(fba.strip_powered_by(software))

    print("DEBUG: Returning domain,software:", domain, software)
    return software
407
def find_domains(tag: bs4.element.Tag) -> list:
    """
    Extracts domain/reason pairs from the table rows found below 'tag'.

    The first <td> of a row is taken as the domain, the second one as the
    reason. Rows without any <td>, blacklisted domains and invalid domains
    are skipped. One known row that lists several gab domains in a single
    cell is expanded into three entries.

    Parameters:
        tag - element that contains the <tr> rows

    Raises:
        ValueError - when 'tag' is not a bs4.element.Tag
        KeyError   - when 'tag' contains no <tr> element

    Returns a list of dicts with the keys "domain" and "reason". "reason" is
    None for rows that have only a single cell.
    """
    print(f"DEBUG: tag[]={type(tag)} - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    # Select the rows once and reuse them for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        print(f"DEBUG: element[]={type(element)}")
        cells = element.find_all("td")
        if len(cells) == 0:
            print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        # A row may carry only the domain cell; indexing the second cell
        # blindly would raise IndexError.
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            print("DEBUG: Multiple domains detected in one row")
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        print(f"DEBUG: Adding domain='{domain}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
457
def add_peers(rows: dict) -> list:
    """
    Collects the peers listed under the keys "linked", "allowed" and
    "blocked" of 'rows'.

    Every peer is cleaned with tidyup.domain(); blacklisted peers are left
    out. Keys that are missing or set to None are ignored, and so are None
    entries inside the lists.

    Parameters:
        rows - dict with the lists of peers; anything that is not a dict
               (e.g. None for a null "federated_instances" value) results in
               an empty list

    Returns the list of collected peers in the order of the keys above.
    """
    # DEBUG: print(f"DEBUG: rows[]={type(rows)} - CALLED!")
    peers = list()
    if not isinstance(rows, dict):
        # DEBUG: print("DEBUG: rows is not a dict, nothing to add - EXIT!")
        return peers

    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if rows.get(key) is None:
            continue

        # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
        for peer in rows[key]:
            if peer is None:
                # tidyup.domain() cannot parse None
                continue

            # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
            peer = tidyup.domain(peer)

            # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
            if blacklist.is_blacklisted(peer):
                # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
                continue

            # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
            peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers