1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import validators

from fba import csrf
from fba import utils

from fba.helpers import config
from fba.helpers import domain as domain_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs)
# Accepted values for the "rel" attribute in a .well-known/nodeinfo "links"
# entry; both https:// and http:// schema variants are seen in the wild.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch and register the peer list of `domain`.

    Determines the instance's software when `software` is None, registers
    the domain itself if unknown, then walks its peer list and registers
    every new, wanted instance.

    NOTE(review): this chunk had lines dropped during extraction; the
    missing `elif`/`try`/`continue`/`return` lines were reconstructed to
    match the surviving logging statements — confirm against upstream.

    :param domain: Instance's domain name
    :param origin: Originating domain that referenced this instance (or None)
    :param software: Known software name, or None to auto-detect
    :param command: Command (caller identifier) recorded with new rows
    :param path: Optional nodeinfo path hint
    :raises ValueError: When a parameter has a wrong type or is empty
    """
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
        instances.set_last_instance_fetch(domain)

        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best-effort: record the error but continue with software=None
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        logger.warning("Cannot fetch peers: domain='%s'", domain)
        return
    elif instances.has_pending(domain):
        logger.debug("domain='%s' has pending nodeinfo data, flushing ...", domain)
        instances.update_data(domain)

    logger.info("Checking %d instances from domain='%s' ...", len(peerlist), domain)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)

        # Skip "None" types as tidyup.domain() cannot parse them
        if instance is None:
            continue

        logger.debug("instance='%s' - BEFORE", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER", instance)

        if instance == "":
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of `domain`.

    Dispatches to software-specific fetchers (misskey/lemmy/peertube) when
    applicable, otherwise queries the generic Mastodon-compatible API path.

    NOTE(review): dropped lines (variable initialisation, `try:`, loop
    header, `return`) were reconstructed from the surviving statements —
    confirm against upstream.

    :param domain: Instance's domain name
    :param software: Detected software name, or None
    :return: List of peer domain names (possibly empty)
    :raises ValueError: When `software` is neither str nor None
    """
    logger.debug("domain='%s',software='%s' - CALLED!", domain, software)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s) ...", domain)
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return list()

    paths = [
        "/api/v1/instance/peers",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json]()=%d", path, domain, len(data['json']))
            peers = data["json"]
            instances.set_success(domain)
            break

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo from `domain`.

    First tries auto-discovery via /.well-known/nodeinfo, then falls back
    to probing well-known static nodeinfo paths.

    NOTE(review): dropped lines (`try:`, error-dict literal, request path
    list, `break`, `return`) were reconstructed; the full set of
    `request_paths` entries is inferred from upstream — confirm.

    :param domain: Instance's domain name
    :param path: Optional path hint; only matching paths are probed
    :return: Parsed nodeinfo JSON dict, or an API result dict which may
             contain "error_message"/"exception" keys on failure
    :raises ValueError: When `path` is neither str nor None
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    data = dict()
    for request in request_paths:
        logger.debug("path[%s]='%s',request='%s'", type(path), path, request)
        if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [f"http://{domain}{path}", f"https://{domain}{path}"]:
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break

            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo for `domain` via /.well-known/nodeinfo.

    Reads the "links" array from the well-known document and follows the
    first link whose "rel" value is a known nodeinfo schema identifier.

    NOTE(review): dropped lines (`try:`, `url = link["href"]`, `continue`,
    `break`, `else:` branches, `return`) were reconstructed — confirm
    against upstream.

    :param domain: Instance's domain name
    :return: API result dict from the followed nodeinfo URL; may contain
             "error_message"/"exception" keys on failure
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": type(exception),
            "exception"    : exception,
        }

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    url = link["href"]
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        continue

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        instances.set_success(domain)
                        break
                    else:
                        instances.set_last_error(domain, data)
                else:
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        else:
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Determine software by scraping HTML meta tags from `domain`.

    Fetches `path` and inspects the <meta name="generator"> tag, falling
    back to <meta property="og:site_name">, then strips version numbers
    and marketing suffixes ("powered by", "hosted on", ...).

    NOTE(review): dropped lines (`elif path == ""`, `software = None`
    init/reset, `return`) were reconstructed — confirm against upstream.

    :param domain: Instance's domain name
    :param path: Path to fetch, defaults to "/"
    :return: Detected software name, or None when nothing was found
    :raises ValueError: When `path` is not str or is empty
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # NOTE(review): `.find("<html") > 0` misses documents starting exactly
    # with "<html" (index 0); kept as-is to preserve behavior.
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name running on `domain`.

    Tries nodeinfo first; falls back to HTML meta-tag scraping. Normalizes
    known forks/aliases (e.g. akkoma -> pleroma, hometown -> mastodon) and
    strips version numbers and marketing suffixes.

    NOTE(review): dropped lines (`software = None` init, alias
    assignments, `return` statements) were reconstructed — confirm
    against upstream. Also fixes the final debug call which passed two
    arguments for a single '%s' placeholder.

    :param domain: Instance's domain name
    :param path: Optional nodeinfo path hint
    :return: Detected software name, or None
    :raises ValueError: When `path` is neither str nor None
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code='%d'", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = tidyup.domain(software)
    logger.debug("software='%s'- AFTER!", software)

    # Normalize known forks/aliases to their upstream project name
    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
        software = "misskey"
    elif software == "runtube.re":
        logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software['%s']='%s'", type(software), software)
    if software == "":
        logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)
        software = None

    logger.debug("software[]='%s'", type(software))
    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    # Fixed: original passed (domain, software) to a single-'%s' format
    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract (domain, reason) pairs from an HTML blocklist table.

    Walks all <tr> rows in `tag`, taking the first <td> as domain and the
    second as reason. One known combined row ("gab.com/.ai, develop.gab.com")
    is split into its three component domains.

    NOTE(review): dropped lines (list init, `continue` statements, the
    append bodies, `return`) were reconstructed — confirm against upstream.

    :param tag: Table element containing <tr>/<td> rows
    :return: List of dicts with "domain" and "reason" keys
    :raises ValueError: When `tag` is not a bs4 Tag
    :raises KeyError: When the table has no rows
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect wanted peer domains from a blocklist/peers response.

    Scans the "linked", "allowed" and "blocked" keys of `rows`; each entry
    may be a plain domain string or a dict carrying a "domain" key. Every
    entry is tidied and filtered through utils.is_domain_wanted().

    NOTE(review): dropped lines (list init, `continue`, `else:` raise,
    append, `return`) were reconstructed — confirm against upstream.

    :param rows: Response dict possibly containing peer lists
    :return: Flat list of tidied, wanted peer domains
    :raises ValueError: When `rows` is not a dict or a peer entry has an
                        unsupported type
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Adding peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers