1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import validators

# NOTE(review): logging, bs4, validators, csrf and utils are referenced in this
# module but were not visible in the reviewed import block — re-added here;
# confirm against the full file to avoid duplicate imports.
from fba import csrf
from fba import utils
from fba.helpers import config
from fba.helpers import domain as domain_helper
from fba.helpers import tidyup
from fba.helpers import version
from fba.http import network
from fba.models import instances
from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs) — the well-known nodeinfo document links
# schema versions through these values; both https and http variants occur
# in the wild, newest schema first.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peer list of `domain` and register every new, wanted peer.

    Parameters:
    domain   -- instance to fetch peers from
    origin   -- domain that led us to this instance (may be None)
    software -- software name of the instance, or None to auto-detect
    command  -- command name recorded as the source of new entries
    path     -- optional nodeinfo path hint forwarded to determine_software()

    NOTE(review): control-flow lines missing from the reviewed copy (try/
    continue/return/guards) were reconstructed — verify against VCS history.
    """
    logger.debug("domain='%s',origin='%s',software='%s',path='%s' - CALLED!", domain, origin, software, path)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif software is None:
        logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
        instances.set_last_instance_fetch(domain)

        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best-effort: record the error and continue with software=None
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        logger.warning("Cannot fetch peers: domain='%s'", domain)
        return
    elif instances.has_pending(domain):
        logger.debug("domain='%s' has pending nodeinfo data, flushing ...", domain)
        instances.update_data(domain)

    logger.info("Checking %d instances from domain='%s' ...", len(peerlist), domain)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)

        # Skip "None" types as tidyup.domain() cannot parse them
        if instance is None:
            continue

        logger.debug("instance='%s' - BEFORE", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER", instance)

        if instance == "":
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            # Some peer lists leak single-user profile URLs instead of bare domains
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of an instance.

    Dispatches to a software-specific fetcher for misskey/lemmy/peertube;
    otherwise tries the Mastodon-style /api/v1/instance/peers endpoint and
    falls back to a Lemmy-style site API with 'federated_instances'.

    Returns a (possibly empty) list of peer domain names.

    NOTE(review): lines missing from the reviewed copy (peers init, headers
    init, try:, fallback endpoint path, returns) were reconstructed.
    """
    logger.debug("domain='%s',software='%s' - CALLED!", domain, software)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s) ...", domain)
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return peers

    logger.debug("Fetching peers from domain='%s',software='%s' ...", domain, software)
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("data[]='%s'", type(data))
    if "error_message" in data:
        logger.debug("Was not able to fetch peers, trying alternative ...")
        # NOTE(review): alternative endpoint path reconstructed — confirm it
        # matches the Lemmy-style API this branch parses below.
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.warning("Could not reach any JSON API at domain='%s',status_code='%d',error_message='%s'", domain, data['status_code'], data['error_message'])
        elif "federated_instances" in data["json"]:
            logger.debug("Found federated_instances for domain='%s'", domain)
            peers = peers + add_peers(data["json"]["federated_instances"])
            logger.debug("Added instance(s) to peers")
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            logger.warning("message='%s',domain='%s'", message, domain)
            instances.set_last_error(domain, message)
    elif isinstance(data["json"], list):
        logger.debug("Querying API was successful: domain='%s',data[json]()=%d", domain, len(data['json']))
        peers = data["json"]
    else:
        logger.warning("Cannot parse data[json][]='%s'", type(data['json']))

    logger.debug("Adding %d for domain='%s'", len(peers), domain)
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo JSON for `domain`.

    Tries the .well-known auto-discovery first; on failure probes a list of
    static nodeinfo paths. Returns the nodeinfo JSON dict on success, or the
    last API response dict (containing 'error_message'/'exception') on failure.

    path -- optional known nodeinfo path (bare path or full URL); when given,
            only that candidate is probed.

    NOTE(review): lines missing from the reviewed copy (headers/data init,
    try:, candidate path list, break, return) were reconstructed.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d)='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    data = dict()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # Static candidates, newest schema first; /api/v1/instance as last resort.
    # NOTE(review): entries besides 2.1.json/2.0.json reconstructed — confirm.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    for request in request_paths:
        logger.debug("path[%s]='%s',request='%s'", type(path), path, request)
        if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [f"http://{domain}{path}", f"https://{domain}{path}"]:
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break

            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Fetch nodeinfo via the /.well-known/nodeinfo discovery document.

    Follows the first link whose 'rel' value is a known nodeinfo schema
    identifier and fetches the actual nodeinfo JSON from its 'href'.
    Returns the last API response dict; on success it contains 'json'.

    NOTE(review): lines missing from the reviewed copy (headers init, try:,
    url assignment, continue/break, return) were reconstructed.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": type(exception),
            "exception"    : exception,
        }

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    url = link["href"]
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        continue

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON nodeinfo()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        break
                    else:
                        instances.set_last_error(domain, data)
                else:
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        else:
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Determine the software of `domain` from its HTML landing page.

    Inspects the <meta name="generator"> tag, falling back to
    <meta property="og:site_name">, then normalizes the result (version
    numbers, 'powered by', 'hosted on', ' by ', ' see ' suffixes removed).

    Returns the normalized software name, or None when nothing was found.

    NOTE(review): lines missing from the reviewed copy (empty-path guard,
    software init, None correction, return) were reconstructed.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name of `domain`.

    Primary source is nodeinfo (via fetch_nodeinfo); when that yields an
    error or no [software][name], falls back to parsing the landing page
    (fetch_generator_from_path). Known forks/aliases are mapped to their
    upstream name (e.g. akkoma -> pleroma) and decorations like version
    numbers or 'powered by ...' are stripped.

    Returns the normalized software name, or None when undeterminable.

    NOTE(review): lines missing from the reviewed copy (software init,
    alias assignments, empty-name reset, returns) were reconstructed.
    Bug fixed: the EXIT debug passed two args for one '%s' placeholder.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code='%d'", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = tidyup.domain(software)
    logger.debug("software='%s'- AFTER!", software)

    # Map known forks/aliases onto their upstream software name.
    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
        software = "misskey"
    elif software == "runtube.re":
        logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software['%s']='%s'", type(software), software)
    if software == "":
        logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)
        software = None

    logger.debug("software[]='%s'", type(software))
    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    # Fixed: original passed (domain, software) for a single '%s' placeholder
    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract (domain, reason) records from an HTML table.

    Expects `tag` to contain <tr> rows whose first <td> is the domain and
    second <td> the block reason. One hard-coded multi-domain cell
    ("gab.com/.ai, develop.gab.com") is expanded into separate records.

    Returns a list of {"domain": ..., "reason": ...} dicts.
    Raises ValueError/KeyError on invalid input.

    NOTE(review): lines missing from the reviewed copy (list init, continue,
    append calls, return) were reconstructed.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect wanted peer domains from a 'federated_instances' record.

    Scans the "linked", "allowed" and "blocked" keys of `rows`; each entry
    may be a plain domain string or a dict with a "domain" key. Entries are
    normalized via tidyup.domain() and filtered by utils.is_domain_wanted().

    Returns the list of accepted peer domains.
    Raises ValueError when rows is not a dict or a peer has an unsupported type.

    NOTE(review): lines missing from the reviewed copy (list init, continue,
    append, return) were reconstructed.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Adding peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers