# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from urllib.parse import urlparse

import bs4
import validators

from fba import blacklist
from fba import config
from fba import csrf
from fba import network

from fba.helpers import tidyup
from fba.helpers import version

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
# "rel" identifiers (no real URLs) that a /.well-known/nodeinfo document may
# advertise. They are only compared verbatim against each link's "rel" value;
# both the https:// and the http:// spelling of every schema are accepted.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Register `domain` and every not-yet-known peer it reports.

    All parameters are validated first. If `software` is None it is then
    detected through determine_software(); a failed detection is tolerated
    and the crawl continues with software=None. Afterwards the domain is
    registered (if unknown), its peers are fetched with fetch_peers() and
    every acceptable peer that is not registered yet is added with `domain`
    as its origin.

    Parameters:
        domain:   Domain to crawl; must be a non-empty, valid domain name.
        origin:   Domain that led to `domain`, or None.
        software: Software name of `domain`, or None to have it detected.
        command:  Non-empty name of the invoking command; stored with new rows.
        path:     Optional nodeinfo path handed to determine_software().

    Raises:
        ValueError: On a wrongly typed or empty parameter, a fake ".tld"
                    domain or a syntactically invalid domain.
    """
    # Validation runs completely before any lookup or database write, also
    # when the software still has to be detected.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif domain.endswith(".arpa"):
        print(f"WARNING: domain='{domain}' is a reversed .arpa domain and should not be used generally.")
        return
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")

    if software is None:
        instances.set_last_instance_fetch(domain)

        try:
            software = determine_software(domain, path)
        except network.exceptions:
            # Detection is best-effort: keep software=None and carry on
            pass

    if not instances.is_registered(domain.split("/")[0]):
        instances.add(domain, origin, command, path, software)

    instances.set_last_instance_fetch(domain)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending(domain):
        # Flush nodeinfo data that was collected while fetching
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}',software='{software}'")
            continue
        elif instance.endswith(".arpa"):
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            # Link to a single user profile, not an instance
            continue
        elif not instances.is_registered(instance):
            instances.add(instance, domain, command)
def fetch_peers(domain: str, software: str) -> list:
    """Return the peers reported by `domain`.

    Misskey, Lemmy and PeerTube are delegated to their own network modules.
    Every other software is asked for "/api/v1/instance/peers"; when that
    fails a second API is tried whose "federated_instances" element is
    flattened through add_peers(). The result is stored with
    instances.set_total_peers() before it is returned.

    Parameters:
        domain:   Non-empty domain to query.
        software: Software name of `domain`, or None when unknown.

    Returns:
        The JSON body of the peers API, or the list built from
        "federated_instances"; an empty list when nothing could be fetched.

    Raises:
        ValueError: On a wrongly typed or empty parameter.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return peers

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeout
    )

    if "error_message" in data:
        # Was not able to fetch peers, trying alternative.
        # NOTE(review): the endpoint is reconstructed from the
        # "federated_instances" key read below - confirm "/api/v3/site".
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeout
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            peers.extend(add_peers(data["json"]["federated_instances"]))
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.set_last_error(domain, message)
    else:
        # Querying the peers API was successful
        peers = data["json"]

    instances.set_total_peers(domain, peers)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for `domain`, auto-discovery first, static paths second.

    The result of fetch_wellknown_nodeinfo() is preferred: when it carries a
    non-empty "json" element without "error_message", that JSON is returned
    directly. Otherwise a list of well-known static request paths is probed
    and the response of the first successful request is returned.

    Parameters:
        domain: Non-empty domain to query.
        path:   Optional single path to probe instead of all static paths.
                An absolute http(s) URL on `domain` is reduced to its path
                component first.

    Returns:
        The discovered nodeinfo JSON, the response dict of the last probed
        static path, an empty dict when no path was probed, or a dict with
        "status_code", "error_message" and "exception" when the CSRF check
        failed.

    Raises:
        ValueError: On a wrongly typed or empty parameter.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        # NOTE(review): "status_code" is reconstructed; other code in this
        # module reads data['status_code'] next to data['error_message'].
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # A stored nodeinfo URL may be absolute; only its path component can
    # match one of the request paths below.
    if path is not None and path.startswith((f"http://{domain}/", f"https://{domain}/")):
        path = urlparse(path).path

    # NOTE(review): only the two ".json" entries are certain, the remaining
    # entries are reconstructed - confirm against the project history.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    data = dict()
    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    for request in request_paths:
        if path is not None and path != request:
            # A specific path was asked for and this is not it
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            timeout
        )

        if "error_message" not in data:
            instances.set_detection_mode(domain, "STATIC_CHECK")
            instances.set_nodeinfo_url(domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover nodeinfo through "/.well-known/nodeinfo" on `domain`.

    The discovery document's "links" are walked in order; the first link
    whose "rel" is listed in nodeinfo_identifier and whose "href" can be
    fetched as JSON wins. On success the detection mode "AUTO_DISCOVERY"
    and the nodeinfo URL are recorded for the domain.

    Parameters:
        domain: Non-empty domain to query.

    Returns:
        The response dict of the last request that was made (discovery
        document or linked nodeinfo), or a dict with "status_code",
        "error_message" and "exception" when the CSRF check failed.

    Raises:
        ValueError: On a wrongly typed or empty parameter.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        # NOTE(review): "status_code" is reconstructed; "error_message" is a
        # formatted string here, same as in fetch_nodeinfo().
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]

        if "links" in nodeinfo:
            for link in nodeinfo["links"]:
                if not isinstance(link, dict) or "rel" not in link:
                    print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found")
                elif link["rel"] in nodeinfo_identifier:
                    data = network.fetch_api_url(
                        link["href"],
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    if "error_message" not in data and "json" in data:
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        break
                    else:
                        # Remember the failure and try the next link
                        instances.set_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of `domain` from the HTML document at `path`.

    The document's <meta name="generator"> content is preferred, the
    <meta property="og:site_name"> content is the fallback. The found text
    is tidied up, and version numbers as well as "powered by", "hosted on",
    " by " and " see " phrases are stripped through the version helpers.

    Parameters:
        domain: Non-empty domain to query.
        path:   Non-empty path of the document to fetch, "/" by default.

    Returns:
        The cleaned-up software name, or None when the page could not be
        fetched or neither meta tag yielded a name.

    Raises:
        ValueError: On a wrongly typed or empty parameter.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    if response.ok and response.status_code < 300 and len(response.text) > 0:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            software = tidyup.domain(site_name.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set_detection_mode(domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # An empty name is reported as "not detected"
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number, remove it
        software = version.remove(software)

    # At most one of the decorating phrases is stripped, in this priority
    if isinstance(software, str) and "powered by " in software:
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software `domain` runs, normalised to a base name.

    Nodeinfo is fetched with fetch_nodeinfo(). When it yields no usable
    [software][name] (error, bare message, missing element or a name that
    is empty after tidying), the generator found by
    fetch_generator_from_path() is used instead. Names of known forks are
    mapped to the software they derive from; slash/pipe separated names,
    "powered by", " by " and " see " phrases and version numbers are
    stripped.

    Parameters:
        domain: Non-empty domain to query.
        path:   Optional nodeinfo path handed to fetch_nodeinfo().

    Returns:
        The software name, or None when nothing could be detected.

    Raises:
        ValueError: On a wrongly typed or empty parameter.
        Exception:  Whatever exception fetch_nodeinfo() recorded under the
                    "exception" key is raised again.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    data = fetch_nodeinfo(domain, path)

    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # JSON response does not include [software][name], fetching / ...
        return fetch_generator_from_path(domain)

    software = tidyup.domain(data["software"]["name"])

    # Forks and re-brands are recorded under the software they derive from
    aliases = {
        "akkoma"          : "pleroma",
        "rebased"         : "pleroma",
        "hometown"        : "mastodon",
        "ecko"            : "mastodon",
        "calckey"         : "misskey",
        "groundpolis"     : "misskey",
        "foundkey"        : "misskey",
        "cherrypick"      : "misskey",
        "meisskey"        : "misskey",
        "runtube.re"      : "peertube",
        "nextcloud social": "nextcloud",
    }

    # Only a real string can be normalised; None falls through to the
    # generator fallback below instead of raising.
    if isinstance(software, str):
        if software in aliases:
            software = aliases[software]
        elif software.find("/") > 0:
            print("WARNING: Spliting of slash:", software)
            software = tidyup.domain(software.split("/")[-1])
        elif software.find("|") > 0:
            print("WARNING: Spliting of pipe:", software)
            software = tidyup.domain(software.split("|")[0])
        elif "powered by" in software:
            software = version.strip_powered_by(software)
        elif " by " in software:
            software = version.strip_until(software, " by ")
        elif " see " in software:
            software = version.strip_until(software, " see ")

    if software is None or software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        # Nodeinfo gave no usable name, trying generator
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number, remove it
        software = version.remove(software)

    if isinstance(software, str) and "powered by" in software:
        software = version.remove(version.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first <td> of a row is the domain, the second one the reason. Rows
    without any <td> (e.g. header rows), blacklisted domains and invalid
    domains are skipped. A row with only one <td> gets the reason None.

    Parameters:
        tag: Table element whose <tr> rows are examined.

    Returns:
        A list of dicts, each with the keys "domain" and "reason".

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError:   If `tag` contains no <tr> at all.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # Skipping element, no <td> found
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains detected in one row.
            # NOTE(review): only "develop.gab.com" is certain, the other two
            # entries are reconstructed from the combined cell text - confirm.
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain):
            print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def add_peers(rows: dict) -> list:
    """Flatten a "federated_instances" structure into a list of peer domains.

    The lists stored under the keys "linked", "allowed" and "blocked" are
    visited in that order; missing keys and keys holding None are ignored.
    Every entry is cleaned with tidyup.domain() and kept unless it is
    blacklisted.

    Parameters:
        rows: Dict with any of the keys above, or None.

    Returns:
        The collected peers; an empty list when `rows` is None or holds
        none of the keys.
    """
    peers = list()
    if rows is None:
        # Nothing was reported at all
        return peers

    for key in ["linked", "allowed", "blocked"]:
        if key not in rows or rows[key] is None:
            continue

        for peer in rows[key]:
            # Entries may be objects carrying the name under "domain"
            # (presumably newer Lemmy releases - verify against the API docs)
            if isinstance(peer, dict) and "domain" in peer:
                peer = peer["domain"]

            if peer is None:
                # Skip "None" types as tidyup.domain() cannot parse them
                continue

            peer = tidyup.domain(peer)

            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers