1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import domain as domain_helper
28 from fba.helpers import tidyup
29 from fba.helpers import version
31 from fba.http import network
33 from fba.models import instances
35 from fba.networks import lemmy
36 from fba.networks import misskey
37 from fba.networks import peertube
# Root logging configuration happens once at import time; the module-level
# logger below follows the stdlib logging.getLogger(__name__) convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs)
# Accepted nodeinfo schema identifiers as found in the "rel" attribute of
# /.well-known/nodeinfo "links" entries, newest schema first.  Both the
# https:// and http:// variants are listed because some instances publish
# the plain-http form.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
    # NOTE(review): the closing "]" of this list is not visible in this
    # listing (source line appears omitted).
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peer list of ``domain`` and register every newly seen peer.

    Parameters:
      domain   -- instance to query (validated via domain_helper.raise_on())
      origin   -- domain this instance was learned from (str or None)
      software -- software name if already known; None triggers auto-detection
      command  -- name of the invoking command, stored with new instances
      path     -- optional nodeinfo path hint forwarded to determine_software()

    Raises ValueError on invalid parameters; returns None.

    NOTE(review): this listing has gaps (several source lines are missing);
    the gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    # NOTE(review): an `elif command == "":` style guard appears to be
    # missing here, directly above the next raise.
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
        instances.set_last_instance_fetch(domain)

        # Software not yet known - detect it, tolerating network failures.
        # NOTE(review): the opening `try:` for the `except` below is missing
        # from this listing.
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
    peerlist = fetch_peers(domain, software)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    # NOTE(review): the condition introducing this warning (presumably a
    # `peerlist is None` check with an early return) is missing here.
        logger.warning("Cannot fetch peers: domain='%s'", domain)
    elif instances.has_pending(domain):
        logger.debug("domain='%s' has pending nodeinfo data, flushing ...", domain)
        instances.update_data(domain)

    logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)

        # Skip "None" types as tidyup.domain() cannot parse them
        # NOTE(review): the actual None-check/continue lines are not visible
        # in this listing.

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        # NOTE(review): the `if instance == "":` style guard introducing this
        # warning is missing; the SKIPPED branches below presumably end in
        # `continue` lines that are also not visible.
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the list of federated peers for ``domain``.

    misskey, lemmy and peertube are delegated to their dedicated network
    modules; everything else is queried through generic JSON API path(s).

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain='%s',software='%s' - CALLED!", domain, software)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    # Software-specific peer fetchers take precedence over the generic path.
    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s) ...", domain)
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the opening `try:` for the `except` below is missing.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): an early `return` presumably follows here.

    # NOTE(review): the assignment opening the candidate-paths list (`paths`
    # is used below) is missing; only one entry is visible.
        "/api/v1/instance/peers",

    # Init peers variable
    # NOTE(review): the `peers` initialisation and the `for path in ...:`
    # loop header are not visible around here.

    logger.debug("Checking %d paths ...", len(paths))
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            # NOTE(review): leading arguments of this call (presumably
            # domain, path, headers) are missing from this listing.
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            # NOTE(review): the assignment taking the peers out of
            # data["json"] appears to be missing here.
            instances.set_success(domain)

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
        # NOTE(review): a corrective assignment presumably follows here.

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    # NOTE(review): the final `return peers` is not visible in this listing.
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo JSON for ``domain``.

    First tries auto-discovery via fetch_wellknown_nodeinfo(); on failure it
    falls back to probing a static list of well-known nodeinfo paths.
    Returns the decoded data dict (or an error structure).

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        # Auto-discovery succeeded - return its payload directly.
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the opening `try:` for the `except` below is missing.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): the `return {` opening this error structure is
        # missing from this listing.
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception" : exception,

    # NOTE(review): the assignment opening the `request_paths` list (used by
    # the loop below) is missing; only two entries are visible.
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.0.json",

    for request in request_paths:
        logger.debug("request='%s'", request)
        http_url = f"http://{domain}{path}"
        https_url = f"https://{domain}{path}"

        logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
        if path is None or path in [request, http_url, https_url]:
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [http_url, https_url]:
                # Caller handed in a full URL - reduce it to its path part.
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                # NOTE(review): leading arguments of this call are missing
                # from this listing.
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data and "json" in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                # NOTE(review): a `break` presumably follows here; an
                # `else:`/`elif` introducing the warning below is missing.
                logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    # NOTE(review): the final `return data` is not visible in this listing.
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo for ``domain`` via /.well-known/nodeinfo.

    Walks the "links" array of the well-known document and fetches the
    first wanted link whose "rel" value is in nodeinfo_identifier.
    Returns the API response dict (or an error structure).

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): the opening `try:` for the `except` below is missing.
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): the `return {` opening this error structure is
        # missing from this listing.
            "error_message": type(exception),
            "exception" : exception,

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        # NOTE(review): leading arguments of this call (presumably domain
        # and headers) are missing from this listing.
        "/.well-known/nodeinfo",
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    if "error_message" not in data:
        nodeinfo = data["json"]
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    # NOTE(review): an assignment binding `url` (used below)
                    # is presumably missing here.
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        # Relative href - prepend scheme and host from domain.
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        # NOTE(review): a `continue` presumably follows here.

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        # NOTE(review): the leading `url` argument is missing
                        # from this listing.
                        (config.get("connection_timeout"), config.get("read_timeout"))

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        instances.set_success(domain)
                        # NOTE(review): a `break` presumably follows; the
                        # `else:` introducing the error path below is missing.
                        instances.set_last_error(domain, data)
                # NOTE(review): an `else:` introducing this warning is missing.
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        # NOTE(review): an `else:` introducing this warning is missing.
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    # NOTE(review): the final `return data` is not visible in this listing.
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from its HTML front page.

    Inspects <meta name="generator"> first, then
    <meta property="og:site_name">, and finally strips version numbers and
    marketing suffixes ("powered by", "hosted on", " by ", " see ") from the
    value found.  Returns the software name (possibly None).

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    # NOTE(review): an `elif path == "":` style guard appears to be missing
    # here, directly above the next raise.
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    # NOTE(review): an initialisation of `software` (used below even when no
    # meta tag matches) is presumably missing around here.

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        # NOTE(review): the assignment setting `software` to None is
        # presumably missing here.
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    # NOTE(review): the final `return software` is not visible in this
    # listing.
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name run by ``domain``.

    Tries nodeinfo first (fetch_nodeinfo()), falling back to HTML scraping
    (fetch_generator_from_path()) on errors, then normalises well-known
    forks/aliases (akkoma, hometown, calckey, ...) onto their canonical
    upstream names.

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    # NOTE(review): an initialisation of `software` is presumably missing
    # around here.

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code='%d'", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    # NOTE(review): the guard introducing this early exit (presumably
    # `if software is None:`) and its `return` are missing here.
        logger.debug("Returning None - EXIT!")

    logger.debug("software='%s'- BEFORE!", software)
    software = tidyup.domain(software)
    logger.debug("software='%s'- AFTER!", software)

    # Normalise known forks/aliases onto their canonical upstream names.
    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
        # NOTE(review): the assignment setting software to "pleroma" is
        # missing from this listing.
    elif software in ["hometown", "ecko"]:
        logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
        # NOTE(review): the assignment setting software to "misskey" is
        # missing from this listing.
    elif software == "runtube.re":
        logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software['%s']='%s'", type(software), software)
    # NOTE(review): the guard introducing this warning (presumably
    # `if software == "":`) is missing here.
        logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)

    logger.debug("software[]='%s'", type(software))
    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    # NOTE(review): BUG - the format string below has one '%s' placeholder
    # but two arguments (domain, software); the logging module will report
    # "not all arguments converted" for this call.  The 'domain' argument
    # should be dropped or the format string extended.
    logger.debug("software='%s' - EXIT!", domain, software)
    # NOTE(review): the final `return software` is not visible in this
    # listing.
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract (domain, reason) records from an HTML table.

    Expects ``tag`` to contain <tr> rows whose first <td> holds a domain
    name and whose second <td> holds a block reason.  Returns the
    accumulated list of records.

    Raises ValueError when ``tag`` is not a bs4 Tag and KeyError when the
    table has no rows.

    NOTE(review): this listing has gaps (several source lines are missing);
    gap positions are marked with NOTE(review) comments below.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    # NOTE(review): the initialisation of `domains` (used below) is
    # presumably missing around here.
    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            logger.debug("Skipping element, no <td> found")
            # NOTE(review): a `continue` presumably follows here.

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            # NOTE(review): a `continue` presumably follows here.
        elif domain == "gab.com/.ai, develop.gab.com":
            # Special case: one table row listing several gab domains.
            logger.debug("Multiple domains detected in one row")
            # NOTE(review): several lines appending the split-out records are
            # missing here; only this fragment of one record is visible:
                "domain": "develop.gab.com",
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            # NOTE(review): a `continue` presumably follows here.

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        # NOTE(review): the append of the record to `domains` is missing
        # from this listing.

    logger.debug("domains()=%d - EXIT!", len(domains))
    # NOTE(review): the final `return domains` is not visible in this
    # listing.
def add_peers(rows: dict) -> list:
    """Collect peer domains from the 'linked', 'allowed' and 'blocked'
    keys of a federation API response.

    Each entry may be a plain domain string or a dict carrying a 'domain'
    key; empty or unwanted peers are skipped, unsupported entry types raise
    ValueError.

    NOTE(review): this listing has gaps (several source lines are missing,
    and the function may continue past the end of this view); gap positions
    are marked with NOTE(review) comments below.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    # NOTE(review): the initialisation of `peers` (used below) is presumably
    # missing around here.
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            # NOTE(review): a `continue` presumably follows here.

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                # NOTE(review): a `continue` presumably follows here.
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            # NOTE(review): an `else:` introducing this raise is missing.
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                # NOTE(review): a `continue` presumably follows here.

            logger.debug("Adding peer='%s' ...", peer)
            # NOTE(review): the append of `peer` to `peers` is missing from
            # this listing.

    logger.debug("peers()=%d - EXIT!", len(peers))
    # NOTE(review): the final `return peers` presumably follows past the end
    # of this view.