1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import domain as domain_helper
28 from fba.helpers import tidyup
29 from fba.helpers import version
31 from fba.http import network
33 from fba.models import instances
35 from fba.networks import lemmy
36 from fba.networks import misskey
37 from fba.networks import peertube
# Module-level logging setup; `logger` is shared by every function below.
# NOTE(review): the `import logging` line is not visible in this excerpt.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs)
# Accepted nodeinfo schema identifiers, matched against the "rel" field of
# links returned by /.well-known/nodeinfo (newest schema first, https before
# http) — see fetch_wellknown_nodeinfo() below.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
    # NOTE(review): the closing "]" of this list is not visible in this
    # excerpt (original lines are missing here) — confirm against full file.
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch and register the peer list of *domain*.

    Registers *domain* itself when unknown, determines its software when not
    supplied, flushes pending nodeinfo data, then walks the fetched peer list
    and registers every tidied, wanted, not-yet-registered peer instance.

    NOTE(review): this excerpt has gaps (original line numbers skip) — e.g.
    the `try:` matching the `except network.exceptions` clause and several
    guard/`continue` lines are missing; confirm against the full file.
    """
    logger.debug(f"domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    domain_helper.raise_on(domain)

    # Parameter validation; origin may legitimately be None (top-level fetch).
    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif software is None:
        logger.debug(f"Updating last_instance_fetch for domain='{domain}' ...")
        instances.set_last_instance_fetch(domain)

        logger.debug(f"software for domain='{domain}' is not set, determining ...")
        # NOTE(review): the enclosing `try:` line is missing from this excerpt.
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best-effort: record the error but keep going with software=None.
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug(f"Determined software='{software}' for domain='{domain}'")
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    # NOTE(review): an `elif command == "":` guard appears to be missing here.
        raise ValueError("Parameter 'command' is empty")

    if not instances.is_registered(domain):
        logger.debug(f"Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'")
        instances.add(domain, origin, command, path, software)

    logger.debug(f"Updating last_instance_fetch for domain='{domain}' ...")
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
    peerlist = fetch_peers(domain, software)

    # NOTE(review): the `if peerlist is None:` guard line appears to be
    # missing before this warning; confirm against the full file.
        logger.warning("Cannot fetch peers: domain='%s'", domain)
    elif instances.has_pending(domain):
        # Flush nodeinfo data gathered earlier in this run before iterating.
        logger.debug(f"domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    logger.info("Checking %d instances from domain='%s' ...", len(peerlist), domain)
    for instance in peerlist:
        logger.debug(f"instance='{instance}'")

        # Skip "None" types as tidyup.domain() cannot parse them
        # NOTE(review): the actual `if instance is None: continue` lines are
        # missing from this excerpt.
        logger.debug(f"instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        logger.debug(f"instance='{instance}' - AFTER")

        # NOTE(review): the `if instance == "":` guard appears to be missing.
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            # Per-user URLs are not instances; skip them.
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Return the peer (federated-instance) list for *domain*.

    Dispatches to software-specific fetchers for misskey/lemmy/peertube;
    otherwise queries the Mastodon-style ``/api/v1/instance/peers`` endpoint
    and falls back to an alternative JSON API when that fails. Records the
    total peer count via instances.set_total_peers().

    NOTE(review): this excerpt has gaps — `peers = list()` initialization,
    the `try:` for the CSRF check, several API-call arguments and the final
    `return peers` are among the missing original lines.
    """
    logger.debug(f"domain({len(domain)})='{domain}',software='{software}' - CALLED!")
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    # Software-specific peer fetchers take precedence over the generic API.
    if software == "misskey":
        logger.debug(f"Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug(f"Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        logger.debug(f"Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Init peers variable
    # NOTE(review): the actual `peers = list()` line is missing here.

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the enclosing `try:` line is missing from this excerpt.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        # CSRF probe failed — record the error; the early-return line that
        # presumably follows is missing from this excerpt.
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)

    logger.debug(f"Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        # NOTE(review): the `domain` and `headers` arguments and the closing
        # parenthesis of this call are missing from this excerpt.
        "/api/v1/instance/peers",
        (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("data[]='%s'", type(data))
    if "error_message" in data:
        # Primary endpoint failed — retry against an alternative API path.
        logger.debug("Was not able to fetch peers, trying alternative ...")
        data = network.get_json_api(
            # NOTE(review): the alternative endpoint path argument is missing
            # from this excerpt.
            (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("data[]='%s'", type(data))
    if "error_message" in data:
        logger.warning("Could not reach any JSON API at domain='%s',status_code='%d',error_message='%s'", domain, data['status_code'], data['error_message'])
    elif "federated_instances" in data["json"]:
        # Lemmy-style response: nested "federated_instances" object.
        logger.debug("Found federated_instances for domain='%s'", domain)
        peers = peers + add_peers(data["json"]["federated_instances"])
        logger.debug("Added instance(s) to peers")
        # NOTE(review): an `else:` branch header appears to be missing here.
        message = "JSON response does not contain 'federated_instances' or 'error_message'"
        logger.warning("message='%s',domain='%s'", message, domain)
        instances.set_last_error(domain, message)
    elif isinstance(data["json"], list):
        # Mastodon-style response: a plain list of peer domain names.
        logger.debug("Querying API was successful: domain='%s',data[json]()=%d", domain, len(data['json']))
        # NOTE(review): the `peers = data["json"]` assignment and the final
        # `else:` header appear to be missing here.
        logger.warning("Cannot parse data[json][]='%s'", type(data['json']))

    logger.debug("Adding %d for domain='%s'", len(peers), domain)
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    # NOTE(review): the `return peers` line is missing from this excerpt.
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo JSON for *domain*.

    First tries auto-discovery via fetch_wellknown_nodeinfo(); on failure
    probes a static list of well-known nodeinfo paths. On success records
    detection mode "STATIC_CHECK" and the working nodeinfo URL.

    NOTE(review): this excerpt has gaps — the `headers` initialization, the
    error-dict braces around the `except` return value, most of the
    `request_paths` list, several get_json_api() arguments and the final
    `return data` are among the missing original lines.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # NOTE(review): this is a plain %-style string that also contains the
    # literal text "{len(nodeinfo)}" — there is no f-prefix, so those braces
    # are logged verbatim; likely a leftover from an f-string conversion.
    logger.debug("nodeinfo[%s]({len(nodeinfo)}='%s'", type(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        # Auto-discovery succeeded — return the parsed nodeinfo document.
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the enclosing `try:` line is missing from this excerpt.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): the `return {` line opening this error dict is
        # missing from this excerpt.
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception" : exception,

    # NOTE(review): the `request_paths = [` opener and most list entries are
    # missing from this excerpt; only two static paths are visible.
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.0.json",

    for request in request_paths:
        logger.debug("path[%s]='%s',request='%s'", type(path), path, request)
        # NOTE(review): `path == f"http://{domain}{path}"` compares `path` to
        # a strictly longer string that embeds `path` itself — this can never
        # be True; `request` was presumably intended. Same on the next check.
        if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [f"http://{domain}{path}", f"https://{domain}{path}"]:
                # Strip scheme+host so only the URL path is passed on.
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                # NOTE(review): the `domain`, `request` and `headers`
                # arguments and the closing parenthesis are missing here.
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data:
                # Found a working static nodeinfo endpoint.
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                # NOTE(review): a `break` and the `else:` header appear to be
                # missing here.
            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    # NOTE(review): the `return data` line is missing from this excerpt.
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo for *domain* via /.well-known/nodeinfo.

    Fetches the well-known discovery document, walks its "links" array for a
    rel matching nodeinfo_identifier, then fetches the linked nodeinfo URL.
    On success records detection mode "AUTO_DISCOVERY" and the nodeinfo URL.

    NOTE(review): this excerpt has gaps — the CSRF `try:`, the
    `url = link["href"]` assignment, several call arguments, loop
    `continue`/`break` lines and the final `return data` are among the
    missing original lines.
    """
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the enclosing `try:` line is missing from this excerpt.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): the `return {` line opening this error dict is
        # missing; note also that "error_message" holds a type object here,
        # not a string as elsewhere in this file.
            "error_message": type(exception),
            "exception" : exception,

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        # NOTE(review): the `domain` and `headers` arguments and the closing
        # parenthesis of this call are missing from this excerpt.
        "/.well-known/nodeinfo",
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    if "error_message" not in data:
        nodeinfo = data["json"]
        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    # Malformed link entry — warn and move on.
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    # NOTE(review): the `url = link["href"]` assignment is
                    # missing from this excerpt; `url` below depends on it.
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        # Relative href — prepend https:// and the domain.
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        # NOTE(review): a `continue` appears to be missing here.

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        # NOTE(review): the `url` argument and closing
                        # parenthesis of this call are missing here.
                        (config.get("connection_timeout"), config.get("read_timeout"))

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        # Usable nodeinfo found via auto-discovery.
                        logger.debug("Found JSON nodeinfo()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        # NOTE(review): a `break` and the `else:` header
                        # appear to be missing here.
                        instances.set_last_error(domain, data)
                    # NOTE(review): the `else:` header for unknown rel values
                    # appears to be missing here.
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        # NOTE(review): the `else:` header appears to be missing here.
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    # NOTE(review): the `return data` line is missing from this excerpt.
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Determine *domain*'s software by scraping its HTML front page.

    Fetches *path*, parses the HTML and inspects the ``generator`` meta tag
    (detection mode "GENERATOR"), falling back to ``og:site_name``
    (detection mode "SITE_NAME"). The extracted name is then cleaned of
    version numbers and marketing suffixes ("powered by", "hosted on", ...).

    NOTE(review): this excerpt has gaps — e.g. the empty-`path` guard's
    `elif`, the `software = None` initialization and the final
    `return software` are among the missing original lines.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    # NOTE(review): an `elif path == "":` guard appears to be missing here.
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # Only parse successful responses that actually contain an HTML document.
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # Preferred source: <meta name="generator" content="...">.
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # Fallback source: <meta property="og:site_name" content="...">.
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        # NOTE(review): the `software = None` assignment is missing here.
    elif isinstance(software, str) and ("." in software or " " in software):
        # NOTE(review): this %-style string contains the literal "{domain}" —
        # no f-prefix, so it is logged verbatim (leftover f-string syntax).
        logger.debug("software='%s' may contain a version number, domain='{domain}', removing it ...", software)
        software = version.remove(software)

    # Strip common marketing phrases; each branch is mutually exclusive.
    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    # NOTE(review): the `return software` line is missing from this excerpt.
def determine_software(domain: str, path: str = None) -> str:
    """Determine which fediverse software *domain* runs.

    Primary source is the nodeinfo document (data["software"]["name"]);
    various error shapes fall back to HTML scraping via
    fetch_generator_from_path(). The raw name is then normalized: known
    forks are mapped to their upstream family (pleroma/mastodon/misskey/...)
    and version numbers / marketing phrases are stripped.

    NOTE(review): this excerpt has gaps — the `try:` around fetch_nodeinfo,
    several assignments (e.g. `software = "pleroma"`, `software = "misskey"`),
    a `return None` body and the final `return software` are among the
    missing original lines.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    # NOTE(review): a `software = None` init and/or `try:` appear missing here.
    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        # Nodeinfo unreachable — fall back to HTML scraping.
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code='%d'", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    # NOTE(review): an `if software is None:` guard appears to be missing
    # before this early-exit logging/return.
    logger.debug("Returning None - EXIT!")

    logger.debug("software='%s'- BEFORE!", software)
    software = tidyup.domain(software)
    logger.debug("software='%s'- AFTER!", software)

    # Map known forks/rebrands onto their upstream software family.
    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
        # NOTE(review): the `software = "pleroma"` assignment is missing here.
    elif software in ["hometown", "ecko"]:
        logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
        # NOTE(review): the `software = "misskey"` assignment is missing here.
    elif software == "runtube.re":
        logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software['%s']='%s'", type(software), software)
    # NOTE(review): an `if software == "":` guard appears to be missing here.
        logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)

    logger.debug("software[]='%s'", type(software))
    if str(software) == "":
        # Nothing detected via nodeinfo — last resort: HTML scraping.
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    # NOTE(review): one "%s" placeholder but TWO arguments (domain, software)
    # — logging will report a string-formatting error for this call.
    logger.debug("software='%s' - EXIT!", domain, software)
    # NOTE(review): the `return software` line is missing from this excerpt.
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract (domain, reason) records from an HTML blocklist table.

    Walks every ``<tr>`` of *tag*, taking the first ``<td>`` as the domain
    and the second as the block reason, skipping unwanted/invalid domains
    and special-casing one known multi-domain row.

    NOTE(review): this excerpt has gaps — the `domains = list()` init, the
    `continue` lines, the dict literals appended per row and the final
    `return domains` are among the missing original lines.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            # Header or malformed row — nothing to extract.
            logger.debug("Skipping element, no <td> found")
            # NOTE(review): a `continue` appears to be missing here.

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            # NOTE(review): a `continue` appears to be missing here.
        elif domain == "gab.com/.ai, develop.gab.com":
            # Special case: one source lists several gab domains in one cell.
            logger.debug("Multiple domains detected in one row")
            # NOTE(review): the appended dict literals are mostly missing
            # from this excerpt; only one "domain" key is visible.
                "domain": "develop.gab.com",
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            # NOTE(review): a `continue` appears to be missing here.

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        # NOTE(review): the `domains.append({...})` block is missing here.

    logger.debug("domains()=%d - EXIT!", len(domains))
    # NOTE(review): the `return domains` line is missing from this excerpt.
def add_peers(rows: dict) -> list:
    """Flatten a Lemmy-style "federated_instances" object into a peer list.

    Reads the "linked", "allowed" and "blocked" keys of *rows*; each entry
    may be a plain domain string or a dict with a "domain" key. Entries are
    tidied via tidyup.domain() and filtered by utils.is_domain_wanted().

    NOTE(review): this excerpt has gaps — the `peers = list()` init, the
    `continue` lines, the `peers.append(peer)` call and the final
    `return peers` are among the missing original lines.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            # NOTE(review): a `continue` appears to be missing here.

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                # NOTE(review): a `continue` appears to be missing here.
            elif isinstance(peer, dict) and "domain" in peer:
                # Dict entry: extract and tidy the "domain" value.
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                # String entry: tidy directly.
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            # NOTE(review): the `else:` header appears to be missing here.
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                # NOTE(review): a `continue` appears to be missing here.

            logger.debug("Adding peer='%s' ...", peer)
            # NOTE(review): the `peers.append(peer)` line is missing here.

    logger.debug("peers()=%d - EXIT!", len(peers))
    # NOTE(review): the `return peers` line is missing from this excerpt.