]> git.mxchange.org Git - fba.git/blob - fba/federation.py
9f1e0098cc4f09f21a844259488bfcd0e04a800a
[fba.git] / fba / federation.py
1 # Copyright (C) 2023 Free Software Foundation
2 #
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 # GNU Affero General Public License for more details.
12 #
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
15
16 import bs4
17 import validators
18
19 from fba import blacklist
20 from fba import config
21 from fba import csrf
22 from fba import fba
23 from fba import instances
24 from fba import network
25
26 from fba.helpers import tidyup
27
28 from fba.networks import lemmy
29 from fba.networks import misskey
30 from fba.networks import peertube
31
32 # "rel" identifiers (no real URLs)
nodeinfo_identifier = [
    # Order matters to readers of the list: https before http, and within
    # each scheme the newest schema version first.
    f"{scheme}://nodeinfo.diaspora.software/ns/schema/{version}"
    for scheme in ("https", "http")
    for version in ("2.1", "2.0", "1.1", "1.0")
]
43
def fetch_instances(domain: str, origin: str, software: str, script: str, path: str = None):
    """Register domain and every valid, non-blacklisted peer it reports.

    Args:
        domain: Instance whose peer list is fetched.
        origin: Domain that led to `domain`, or None.
        software: Software name of `domain`; when None it is looked up with
            determine_software(domain, path).
        script: Handed to instances.add() for every newly added record.
        path: Optional path handed to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type or domain is empty.
    """
    # All parameters are validated BEFORE any network lookup happens, so an
    # invalid 'script' is rejected even when 'software' still has to be
    # determined.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
    elif not isinstance(script, str):
        raise ValueError(f"Parameter script[]={type(script)} is not 'str'")

    if software is None:
        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        software = determine_software(domain, path)

    if not instances.is_registered(domain):
        # DEBUG: print("DEBUG: Adding new domain:", domain, origin)
        instances.add(domain, origin, script, path)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending_instance_data(domain):
        # Flush data that was collected for this domain while fetching
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from {domain} ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" entries as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print("WARNING: Empty instance after tidyup.domain(), domain:", domain)
            continue
        elif not validators.domain(instance.split("/")[0]):
            # Only the part in front of the first slash has to be a domain
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue

        if not instances.is_registered(instance):
            # The reporting domain becomes the origin of the new instance
            instances.add(instance, domain, script)

    # DEBUG: print("DEBUG: EXIT!")
103
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peers of domain.

    misskey, lemmy and peertube are delegated to their own modules. For
    anything else "/api/v1/instance/peers" is queried first and
    "/api/v3/site" second.

    Args:
        domain: Instance to query.
        software: Software name of the instance, or None.

    Returns:
        The peers that were found, or None when the CSRF check raised one
        of network.exceptions.

    Raises:
        ValueError: If a parameter has the wrong type or domain is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if software is not None and not isinstance(software, str):
        raise ValueError(f"software[]={type(software)} is not 'str'")

    # Software with a dedicated implementation is handled there
    if software == "misskey":
        return misskey.fetch_peers(domain)
    if software == "lemmy":
        return lemmy.fetch_peers(domain)
    if software == "peertube":
        return peertube.fetch_peers(domain)

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return None

    found = list()

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    if "error_message" not in data:
        # First API answered, its JSON payload is taken as the peer list
        found = data["json"]
    else:
        # First API failed, try the alternative one
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            found.extend(add_peers(data["json"]["federated_instances"]))
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.update_last_error(domain, message)

    # Record the outcome, this also happens when nothing was found
    instances.set_data("total_peers", domain, len(found))
    instances.update_last_instance_fetch(domain)

    return found
175
def fetch_nodeinfo(domain: str, path: str = None) -> list:
    """Fetch nodeinfo of domain, auto-discovery first, static paths second.

    Args:
        domain: Instance to query.
        path: When set and non-empty, only the static request path equal to
            it is tried.

    Returns:
        The response of the first successful lookup. When nothing succeeded,
        the last failed response is returned; that is the auto-discovery
        result when no static path was tried (it may be None if
        auto-discovery itself returned None).

    Raises:
        ValueError: If a parameter has the wrong type or domain is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # Only a successful auto-discovery short-cuts the static probing. A failed
    # response is a non-empty structure too (it carries 'error_message'), so
    # the length alone cannot tell success from failure; None means the
    # auto-discovery could not even be started.
    if nodeinfo is not None and len(nodeinfo) > 0 and "error_message" not in nodeinfo:
        return nodeinfo

    # Fallback result, keeps 'data' bound when no static path gets requested
    data = nodeinfo

    # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return data

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    for request in request_paths:
        if path is not None and path != "" and path != request:
            # A specific path was asked for and this is not the one
            continue

        # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
        data = network.get_json_api(
            domain,
            request,
            headers,
            (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
        )

        if "error_message" not in data:
            # Remember how and where nodeinfo was found
            instances.set_data("detection_mode", domain, "STATIC_CHECK")
            instances.set_data("nodeinfo_url"  , domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
235
def fetch_wellknown_nodeinfo(domain: str) -> list:
    """Discover nodeinfo of domain through "/.well-known/nodeinfo".

    Each entry of the response's 'links' whose 'rel' is listed in
    nodeinfo_identifier is fetched until one response contains 'json'.

    Args:
        domain: Instance to query.

    Returns:
        The most recent response that was fetched (the well-known document
        itself when no link was followed), or None when the CSRF check
        raised one of network.exceptions.

    Raises:
        ValueError: If domain is not a string or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF - EXIT!")
        return None

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" in data:
        # Well-known document could not be fetched, hand the failure back
        return data

    wellknown = data["json"]
    if "links" not in wellknown:
        print("WARNING: nodeinfo does not contain 'links':", domain)
        return data

    for link in wellknown["links"]:
        if link["rel"] not in nodeinfo_identifier:
            print("WARNING: Unknown 'rel' value:", domain, link["rel"])
            continue

        # From here on 'data' is the response of the linked document
        data = network.fetch_api_url(
            link["href"],
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "json" not in data:
            instances.update_last_error(domain, data)
            continue

        # First usable link wins
        instances.set_data("detection_mode", domain, "AUTO_DISCOVERY")
        instances.set_data("nodeinfo_url"  , domain, link["href"])
        break

    return data
289
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of domain from the meta tags of an HTML page.

    <meta name="generator"> is preferred, <meta property="og:site_name"> is
    the fallback. The found text is then reduced with the fba.remove_version()
    and fba.strip_*() helpers.

    Args:
        domain: Instance to query.
        path: Path of the page to fetch, "/" by default.

    Returns:
        The software name, or None when the page could not be fetched, no
        usable meta tag was found or the found value was empty.

    Raises:
        ValueError: If a parameter has the wrong type or is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]={type(path)} is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    if response.ok and response.status_code < 300 and len(response.text) > 0:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # A meta tag without a 'content' attribute yields None, which
        # tidyup.domain() cannot parse, so such a tag counts as "not found".
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            software = tidyup.domain(generator.get("content"))
            print(f"INFO: domain='{domain}' is generated by '{software}'")
            instances.set_data("detection_mode", domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            software = tidyup.domain(site_name.get("content"))
            print(f"INFO: domain='{domain}' has og:site_name='{software}'")
            instances.set_data("detection_mode", domain, "SITE_NAME")

    if isinstance(software, str):
        if software == "":
            # An empty name is reported as "not detected"
            software = None
        elif "." in software or " " in software:
            # A dot or a space hints at a version number behind the name
            software = fba.remove_version(software)

    if isinstance(software, str):
        if " powered by " in software:
            software = fba.remove_version(fba.strip_powered_by(software))
        elif " hosted on " in software:
            software = fba.remove_version(fba.strip_hosted_on(software))
        elif " by " in software:
            software = fba.strip_until(software, " by ")
        elif " see " in software:
            software = fba.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
352
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software running on domain.

    nodeinfo ([software][name]) is the primary source. Whenever it is not
    available, reports an error or leaves no name behind,
    fetch_generator_from_path(domain) is used instead.

    Args:
        domain: Instance to inspect.
        path: Optional nodeinfo path handed to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If a parameter has the wrong type or domain is empty.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]={type(domain)} is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]={type(path)} is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # fetch_nodeinfo() may hand back None (failed CSRF check) or a response
    # without a JSON payload; both mean "no nodeinfo available".
    if data is None or "error_message" in data or "json" not in data:
        return fetch_generator_from_path(domain)

    json = data["json"]

    if "status" in json and json["status"] == "error" and "message" in json:
        print("WARNING: JSON response is an error:", json["message"])
        instances.update_last_error(domain, json["message"])
        return fetch_generator_from_path(domain)
    elif "message" in json:
        print("WARNING: JSON response contains only a message:", json["message"])
        instances.update_last_error(domain, json["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in json or "name" not in json["software"]:
        # nodeinfo without [software][name], use the front page instead
        return fetch_generator_from_path(domain)

    software = tidyup.domain(json["software"]["name"])

    # Map forks to the software they are handled as and strip decoration
    if software in ["akkoma", "rebased"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["calckey", "groundpolis", "foundkey", "cherrypick", "meisskey"]:
        software = "misskey"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = fba.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = fba.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = fba.strip_until(software, " see ")

    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None

    if software is None:
        # Nothing usable in nodeinfo, try the generator meta tag
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # A dot or a space hints at a version number behind the name
        software = fba.remove_version(software)

    if isinstance(software, str) and "powered by" in software:
        software = fba.remove_version(fba.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
437
def find_domains(tag: bs4.element.Tag) -> list:
    """Collect domain/reason pairs from the rows of an HTML table.

    The first <td> of a row is taken as the domain, the second one as the
    reason. Rows without any <td>, blacklisted domains and invalid domains
    are skipped.

    Args:
        tag: Element whose <tr> descendants are examined.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: If tag is not a bs4.element.Tag.
        KeyError: If tag contains no <tr> at all.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]={type(tag)} is not type of bs4.element.Tag")

    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    found = list()
    for row in rows:
        cells = row.findAll("td")
        if not cells:
            # Rows without <td> carry no data
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - skipped!")
            continue

        if domain == "gab.com/.ai, develop.gab.com":
            # One row that lists several domains, all share the same reason
            for name in ("gab.com", "gab.ai", "develop.gab.com"):
                found.append({
                    "domain": name,
                    "reason": reason,
                })
            continue

        if not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - skipped!")
            continue

        found.append({
            "domain": domain,
            "reason": reason,
        })

    return found
487
def add_peers(rows: dict) -> list:
    """Flatten the "linked", "allowed" and "blocked" lists into one list.

    Every entry is passed through tidyup.domain(); blacklisted results are
    left out.

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked"; missing keys and None values are ignored.

    Returns:
        The tidied, non-blacklisted peers in the order they were found.
    """
    collected = list()

    for section in ("linked", "allowed", "blocked"):
        if section not in rows or rows[section] is None:
            # Nothing listed under this key
            continue

        for entry in rows[section]:
            cleaned = tidyup.domain(entry)

            if not blacklist.is_blacklisted(cleaned):
                collected.append(cleaned)

    return collected