# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import validators

from fba import csrf
from fba import utils

from fba.helpers import config
from fba.helpers import domain as domain_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
39 logging.basicConfig(level=logging.INFO)
40 logger = logging.getLogger(__name__)
42 # "rel" identifiers (no real URLs)
43 nodeinfo_identifier = [
44 "https://nodeinfo.diaspora.software/ns/schema/2.1",
45 "https://nodeinfo.diaspora.software/ns/schema/2.0",
46 "https://nodeinfo.diaspora.software/ns/schema/1.1",
47 "https://nodeinfo.diaspora.software/ns/schema/1.0",
48 "http://nodeinfo.diaspora.software/ns/schema/2.1",
49 "http://nodeinfo.diaspora.software/ns/schema/2.0",
50 "http://nodeinfo.diaspora.software/ns/schema/1.1",
51 "http://nodeinfo.diaspora.software/ns/schema/1.0",
54 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
55 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
56 domain_helper.raise_on(domain)
58 if not isinstance(origin, str) and origin is not None:
59 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
60 elif not isinstance(command, str):
61 raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
63 raise ValueError("Parameter 'command' is empty")
64 elif software is None:
65 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
66 instances.set_last_instance_fetch(domain)
69 logger.debug("Software for domain='%s' is not set, determining ...", domain)
70 software = determine_software(domain, path)
71 except network.exceptions as exception:
72 logger.warning("Exception '%s' during determining software type", type(exception))
73 instances.set_last_error(domain, exception)
75 logger.debug("Determined software='%s' for domain='%s'", software, domain)
76 elif not isinstance(software, str):
77 raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
79 if not instances.is_registered(domain):
80 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
81 instances.add(domain, origin, command, path, software)
83 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
84 instances.set_last_instance_fetch(domain)
86 logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
87 peerlist = fetch_peers(domain, software)
89 logger.debug("peerlist[]='%s'", type(peerlist))
90 if isinstance(peerlist, list):
91 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
92 instances.set_total_peers(domain, peerlist)
94 if instances.has_pending(domain):
95 logger.debug("domain='%s' has pending nodeinfo data, flushing ...", domain)
96 instances.update_data(domain)
99 logger.warning("Cannot fetch peers: domain='%s'", domain)
102 logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
103 for instance in peerlist:
104 logger.debug("instance='%s'", instance)
106 # Skip "None" types as tidup.domain() cannot parse them
109 logger.debug("instance='%s' - BEFORE!", instance)
110 instance = tidyup.domain(instance)
111 logger.debug("instance='%s' - AFTER!", instance)
114 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
116 elif not utils.is_domain_wanted(instance):
117 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
119 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
120 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
122 elif not instances.is_registered(instance):
123 logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
124 instances.add(instance, domain, command)
126 logger.debug("EXIT!")
128 def fetch_peers(domain: str, software: str) -> list:
129 logger.debug("domain='%s',software='%s' - CALLED!", domain, software)
130 domain_helper.raise_on(domain)
132 if not isinstance(software, str) and software is not None:
133 raise ValueError(f"software[]='{type(software)}' is not 'str'")
135 if software == "misskey":
136 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
137 return misskey.fetch_peers(domain)
138 elif software == "lemmy":
139 logger.debug("Invoking lemmy.fetch_peers(%s) ...", domain)
140 return lemmy.fetch_peers(domain)
141 elif software == "peertube":
142 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
143 return peertube.fetch_peers(domain)
145 # No CSRF by default, you don't have to add network.api_headers by yourself here
149 logger.debug("Checking CSRF for domain='%s'", domain)
150 headers = csrf.determine(domain, dict())
151 except network.exceptions as exception:
152 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
153 instances.set_last_error(domain, exception)
157 "/api/v1/instance/peers",
161 # Init peers variable
164 logger.debug("Checking %d paths ...", len(paths))
166 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
167 data = network.get_json_api(
171 (config.get("connection_timeout"), config.get("read_timeout"))
174 logger.debug("data[]='%s'", type(data))
175 if "error_message" in data:
176 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
177 instances.set_last_error(domain, data)
178 elif "json" in data and len(data["json"]) > 0:
179 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
181 instances.set_success(domain)
184 if not isinstance(peers, list):
185 logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
188 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
189 instances.set_total_peers(domain, peers)
191 logger.debug("peers()=%d - EXIT!", len(peers))
194 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
195 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
196 domain_helper.raise_on(domain)
198 if not isinstance(path, str) and path is not None:
199 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
201 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
202 nodeinfo = fetch_wellknown_nodeinfo(domain)
204 logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
205 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
206 logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
207 return nodeinfo["json"]
209 # No CSRF by default, you don't have to add network.api_headers by yourself here
214 logger.debug("Checking CSRF for domain='%s'", domain)
215 headers = csrf.determine(domain, dict())
216 except network.exceptions as exception:
217 logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
218 instances.set_last_error(domain, exception)
221 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
222 "exception" : exception,
226 "/nodeinfo/2.1.json",
228 "/nodeinfo/2.0.json",
234 for request in request_paths:
235 logger.debug("request='%s'", request)
236 http_url = f"http://{domain}{path}"
237 https_url = f"https://{domain}{path}"
239 logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
240 if path is None or path in [request, http_url, https_url]:
241 logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
242 if path in [http_url, https_url]:
243 logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
244 components = urlparse(path)
245 path = components.path
247 data = network.get_json_api(
251 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
254 logger.debug("data[]='%s'", type(data))
255 if "error_message" not in data and "json" in data:
256 logger.debug("Success: request='%s'", request)
257 instances.set_detection_mode(domain, "STATIC_CHECK")
258 instances.set_nodeinfo_url(domain, request)
261 logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])
263 logger.debug("data()=%d - EXIT!", len(data))
266 def fetch_wellknown_nodeinfo(domain: str) -> dict:
267 logger.debug("domain='%s' - CALLED!", domain)
268 domain_helper.raise_on(domain)
270 # No CSRF by default, you don't have to add network.api_headers by yourself here
274 logger.debug("Checking CSRF for domain='%s'", domain)
275 headers = csrf.determine(domain, dict())
276 except network.exceptions as exception:
277 logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
278 instances.set_last_error(domain, exception)
281 "error_message": type(exception),
282 "exception" : exception,
285 logger.debug("Fetching .well-known info for domain='%s'", domain)
286 data = network.get_json_api(
288 "/.well-known/nodeinfo",
290 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
293 if "error_message" not in data:
294 nodeinfo = data["json"]
295 instances.set_success(domain)
297 logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
298 if "links" in nodeinfo:
299 logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
300 for link in nodeinfo["links"]:
301 logger.debug("link[%s]='%s'", type(link), link)
302 if not isinstance(link, dict) or not "rel" in link:
303 logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
304 elif link["rel"] in nodeinfo_identifier:
305 # Default is that 'href' has a complete URL, but some hosts don't send that
307 components = urlparse(link["href"])
309 logger.debug("components[%s]='%s'", type(components), components)
310 if components.scheme == "" and components.netloc == "":
311 logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
312 url = f"https://{domain}{url}"
313 components = urlparse(url)
315 if not utils.is_domain_wanted(components.netloc):
316 logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
319 logger.debug("Fetching nodeinfo from url='%s' ...", url)
320 data = network.fetch_api_url(
322 (config.get("connection_timeout"), config.get("read_timeout"))
325 logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
326 if "error_message" not in data and "json" in data:
327 logger.debug("Found JSON data()=%d", len(data))
328 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
329 instances.set_nodeinfo_url(domain, link["href"])
330 instances.set_success(domain)
333 instances.set_last_error(domain, data)
335 logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
337 logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)
339 logger.debug("Returning data[]='%s' - EXIT!", type(data))
342 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
343 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
344 domain_helper.raise_on(domain)
346 if not isinstance(path, str):
347 raise ValueError(f"path[]='{type(path)}' is not 'str'")
349 raise ValueError("Parameter 'path' is empty")
351 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
354 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
355 response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
357 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
358 if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
359 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
360 doc = bs4.BeautifulSoup(response.text, "html.parser")
362 logger.debug("doc[]='%s'", type(doc))
363 generator = doc.find("meta", {"name" : "generator"})
364 site_name = doc.find("meta", {"property": "og:site_name"})
366 logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
367 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
368 logger.debug("Found generator meta tag: domain='%s'", domain)
369 software = tidyup.domain(generator.get("content"))
371 logger.debug("software[%s]='%s'", type(software), software)
372 if software is not None and software != "":
373 logger.info("domain='%s' is generated by '%s'", domain, software)
374 instances.set_detection_mode(domain, "GENERATOR")
375 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
376 logger.debug("Found property=og:site_name, domain='%s'", domain)
377 software = tidyup.domain(site_name.get("content"))
379 logger.debug("software[%s]='%s'", type(software), software)
380 if software is not None and software != "":
381 logger.info("domain='%s' has og:site_name='%s'", domain, software)
382 instances.set_detection_mode(domain, "SITE_NAME")
384 logger.debug("software[]='%s'", type(software))
385 if isinstance(software, str) and software == "":
386 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
388 elif isinstance(software, str) and ("." in software or " " in software):
389 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
390 software = version.remove(software)
392 logger.debug("software[]='%s'", type(software))
393 if isinstance(software, str) and "powered by " in software:
394 logger.debug("software='%s' has 'powered by' in it", software)
395 software = version.remove(version.strip_powered_by(software))
396 elif isinstance(software, str) and " hosted on " in software:
397 logger.debug("software='%s' has 'hosted on' in it", software)
398 software = version.remove(version.strip_hosted_on(software))
399 elif isinstance(software, str) and " by " in software:
400 logger.debug("software='%s' has ' by ' in it", software)
401 software = version.strip_until(software, " by ")
402 elif isinstance(software, str) and " see " in software:
403 logger.debug("software='%s' has ' see ' in it", software)
404 software = version.strip_until(software, " see ")
406 logger.debug("software='%s' - EXIT!", software)
409 def determine_software(domain: str, path: str = None) -> str:
410 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
411 domain_helper.raise_on(domain)
413 if not isinstance(path, str) and path is not None:
414 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
416 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
419 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
420 data = fetch_nodeinfo(domain, path)
422 logger.debug("data[]='%s'", type(data))
423 if "exception" in data:
424 # Continue raising it
425 raise data["exception"]
426 elif "error_message" in data:
427 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code='%d'", data['error_message'], data['status_code'])
428 return fetch_generator_from_path(domain)
429 elif "status" in data and data["status"] == "error" and "message" in data:
430 logger.warning("JSON response is an error: '%s'", data["message"])
431 instances.set_last_error(domain, data["message"])
432 return fetch_generator_from_path(domain)
433 elif "message" in data:
434 logger.warning("JSON response contains only a message: '%s'", data["message"])
435 instances.set_last_error(domain, data["message"])
436 return fetch_generator_from_path(domain)
437 elif "software" not in data or "name" not in data["software"]:
438 logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
439 software = fetch_generator_from_path(domain)
440 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
441 elif "software" in data and "name" in data["software"]:
442 logger.debug("Found data[software][name] in JSON response")
443 software = data["software"]["name"]
446 logger.debug("Returning None - EXIT!")
449 logger.debug("software='%s'- BEFORE!", software)
450 software = tidyup.domain(software)
451 logger.debug("software='%s'- AFTER!", software)
453 if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
454 logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
456 elif software in ["hometown", "ecko"]:
457 logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
458 software = "mastodon"
459 elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
460 logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
462 elif software == "runtube.re":
463 logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
464 software = "peertube"
465 elif software == "nextcloud social":
466 logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
467 software = "nextcloud"
468 elif software.find("/") > 0:
469 logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
470 software = tidyup.domain(software.split("/")[-1])
471 elif software.find("|") > 0:
472 logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
473 software = tidyup.domain(software.split("|")[0])
474 elif "powered by" in software:
475 logger.debug("software='%s' has 'powered by' in it", software)
476 software = version.strip_powered_by(software)
477 elif isinstance(software, str) and " by " in software:
478 logger.debug("software='%s' has ' by ' in it", software)
479 software = version.strip_until(software, " by ")
480 elif isinstance(software, str) and " see " in software:
481 logger.debug("software='%s' has ' see ' in it", software)
482 software = version.strip_until(software, " see ")
484 logger.debug("software['%s']='%s'", type(software), software)
486 logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)
489 logger.debug("software[]='%s'", type(software))
490 if str(software) == "":
491 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
492 software = fetch_generator_from_path(domain)
493 elif len(str(software)) > 0 and ("." in software or " " in software):
494 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
495 software = version.remove(software)
497 logger.debug("software[]='%s'", type(software))
498 if isinstance(software, str) and "powered by" in software:
499 logger.debug("software='%s' has 'powered by' in it", software)
500 software = version.remove(version.strip_powered_by(software))
502 logger.debug("software='%s' - EXIT!", domain, software)
505 def find_domains(tag: bs4.element.Tag) -> list:
506 logger.debug("tag[]='%s' - CALLED!", type(tag))
507 if not isinstance(tag, bs4.element.Tag):
508 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
509 elif len(tag.select("tr")) == 0:
510 raise KeyError("No table rows found in table!")
513 for element in tag.select("tr"):
514 logger.debug("element[]='%s'", type(element))
515 if not element.find("td"):
516 logger.debug("Skipping element, no <td> found")
519 domain = tidyup.domain(element.find("td").text)
520 reason = tidyup.reason(element.findAll("td")[1].text)
522 logger.debug("domain='%s',reason='%s'", domain, reason)
524 if not utils.is_domain_wanted(domain):
525 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
527 elif domain == "gab.com/.ai, develop.gab.com":
528 logger.debug("Multiple domains detected in one row")
538 "domain": "develop.gab.com",
542 elif not validators.domain(domain.split("/")[0]):
543 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
546 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
552 logger.debug("domains()=%d - EXIT!", len(domains))
555 def add_peers(rows: dict) -> list:
556 logger.debug("rows[]='%s' - CALLED!", type(rows))
557 if not isinstance(rows, dict):
558 raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")
561 for key in ["linked", "allowed", "blocked"]:
562 logger.debug("Checking key='%s'", key)
563 if key not in rows or rows[key] is None:
564 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
567 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
568 for peer in rows[key]:
569 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
570 if peer is None or peer == "":
571 logger.debug("peer is empty - SKIPPED")
573 elif isinstance(peer, dict) and "domain" in peer:
574 logger.debug("peer[domain]='%s'", peer['domain'])
575 peer = tidyup.domain(peer["domain"])
576 elif isinstance(peer, str):
577 logger.debug("peer='%s'", peer)
578 peer = tidyup.domain(peer)
580 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
582 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
583 if not utils.is_domain_wanted(peer):
584 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
587 logger.debug("Adding peer='%s' ...", peer)
590 logger.debug("peers()=%d - EXIT!", len(peers))