1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import tidyup
30 from fba.helpers import version
32 from fba.http import network
34 from fba.models import instances
36 from fba.networks import lemmy
37 from fba.networks import misskey
38 from fba.networks import peertube
# Module-level logging setup: configure the root logger and create the
# logger used by every function in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs): nodeinfo schema names accepted in the
# "links" section of /.well-known/nodeinfo, listed newest first and in both
# https and http spelling.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Register ``domain`` (when unknown) and every wanted peer it reports.

    The peer list is obtained through ``fetch_peers()``; each entry is
    normalised with ``tidyup.domain()`` and added via ``instances.add()``
    unless it is empty, unwanted, looks like a user/community link or is
    already registered. Cookies for ``domain`` are cleared before returning.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.
        origin: Origin stored for ``domain`` through ``instances.add()``;
            may be ``None``.
        software: Software name of ``domain``. When ``None`` it is looked up
            with ``determine_software()``.
        command: Non-empty command name, passed on to ``instances.add()``.
        path: Optional nodeinfo path handed to ``determine_software()``.

    Raises:
        ValueError: If ``origin``, ``command`` or ``software`` has the wrong
            type, or ``command`` is an empty string.
    """
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best effort: software stays None, the error is only recorded
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s'", domain, software)
    peerlist = fetch_peers(domain, software)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None:
        logger.warning("Cannot fetch peers: domain='%s'", domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        logger.debug("EXIT!")
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        if instance == "":
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            # The queried domain becomes the origin of the new instance
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Return the peers of ``domain`` as reported by its API.

    misskey, lemmy and peertube are delegated to their network modules. For
    everything else a set of generic API paths is probed and the first
    non-empty JSON answer is used.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.
        software: Software name of ``domain`` or ``None``.

    Returns:
        The peer list. An empty list is returned when the CSRF check fails
        or the API answer is not a list. The delegated network modules
        return whatever their own ``fetch_peers()`` returns.

    Raises:
        ValueError: If ``software`` is neither ``str`` nor ``None``.
    """
    logger.debug("domain='%s',software='%s' - CALLED!", domain, software)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s) ...", domain)
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return list()

    # NOTE(review): only the first entry is visible in the reviewed source,
    # the second one is restored - confirm against the repository
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]
            instances.set_success(domain)
            # First usable answer wins, remaining paths are not tried
            break

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for ``domain``, by auto-discovery first, then by probing.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.
        path: Optional nodeinfo path or full URL. When given, only the
            matching static request path is probed; ``None`` probes all.

    Returns:
        The nodeinfo JSON itself when ``fetch_wellknown_nodeinfo()``
        succeeded. Otherwise the last response of ``network.get_json_api()``
        (payload under ``"json"``, failures carry ``"error_message"``), an
        empty dict when no static path matched ``path``, or an error dict
        with ``"exception"`` when the CSRF check failed.

    Raises:
        ValueError: If ``path`` is neither ``str`` nor ``None``.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d)='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    data = dict()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # NOTE(review): the "status_code" entry is restored, it is not
        # visible in the reviewed source - confirm the value
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # NOTE(review): only the two ".json" entries are visible in the reviewed
    # source, the others are restored - confirm against the repository
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    for request in request_paths:
        logger.debug("request='%s'", request)
        # Build the URLs from the probed request path so a caller-supplied
        # full URL can match; building them from `path` itself never could.
        http_url = f"http://{domain}{request}"
        https_url = f"https://{domain}{request}"

        logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
        if path is None or path in [request, http_url, https_url]:
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [http_url, https_url]:
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data and "json" in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break

            # .get() because a response without "json" is not guaranteed to
            # carry both keys
            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data.get('status_code'), data.get('error_message'))

    logger.debug("data()=%d - EXIT!", len(data))
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through ``/.well-known/nodeinfo``.

    Each entry of the ``links`` section whose ``rel`` is listed in
    ``nodeinfo_identifier`` is tried in order until one yields JSON.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.

    Returns:
        The response of the last request made: the linked nodeinfo document
        on success, otherwise the failed response or the well-known response
        itself. When the CSRF check fails, an error dict carrying the
        exception under ``"exception"``.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # error_message is a string here (same format as fetch_nodeinfo()),
        # not a type object.
        # NOTE(review): the "status_code" entry is restored, it is not
        # visible in the reviewed source - confirm the value
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or "rel" not in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    if not isinstance(link.get("href"), str):
                        # Malformed entry, nothing to fetch
                        logger.warning("link[rel]='%s' has no usable 'href': domain='%s'", link["rel"], domain)
                        continue

                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    url = link["href"]
                    components = urlparse(url)

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        # Exactly one slash between host and path
                        url = f"https://{domain}/{url.lstrip('/')}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        continue

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        instances.set_success(domain)
                        break
                    else:
                        instances.set_last_error(domain, data)
                else:
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        else:
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML served at ``path``.

    The ``<meta name="generator">`` tag is preferred; ``og:site_name`` is
    used when no generator tag with string content exists. The found value
    is then stripped of version numbers and of "powered by", "hosted on",
    " by " and " see " phrases.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.
        path: Non-empty path to fetch, defaults to ``"/"``.

    Returns:
        The software name, or ``None`` when nothing usable was found.

    Raises:
        ValueError: If ``path`` is not a ``str`` or is empty.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # Substring test instead of find() > 0, which rejected documents that
    # start directly with "<html" (index 0)
    if response.ok and response.status_code < 300 and "<html" in response.text:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software ``domain`` runs.

    nodeinfo (``fetch_nodeinfo()``) is consulted first; whenever it gives no
    usable ``software.name`` the HTML generator lookup
    (``fetch_generator_from_path()``) is used instead. Known forks are
    mapped to their parent project and version/"powered by" decorations are
    removed.

    Args:
        domain: Instance to query; checked by ``domain_helper.raise_on()``.
        path: Optional nodeinfo path handed to ``fetch_nodeinfo()``.

    Returns:
        The software name, or ``None`` when it could not be determined.

    Raises:
        ValueError: If ``path`` is neither ``str`` nor ``None``.
        Exception: Whatever exception ``fetch_nodeinfo()`` returned under
            the ``"exception"`` key is raised again.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        # status_code is optional in error responses, so no %d / indexing
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%s", data["error_message"], data.get("status_code"))
        return fetch_generator_from_path(domain)

    if isinstance(data, dict) and isinstance(data.get("json"), dict):
        # fetch_nodeinfo() hands back the raw response when a static path
        # answered, the nodeinfo document is then found under "json"
        logger.debug("Unwrapping data[json] for domain='%s'", domain)
        data = data["json"]

    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or not isinstance(data["software"], dict) or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    else:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = tidyup.domain(software)
    logger.debug("software='%s'- AFTER!", software)

    # Map known forks/aliases to the project they are handled as
    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        logger.debug("Setting pleroma: domain='%s',software='%s'", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        logger.debug("Setting mastodon: domain='%s',software='%s'", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        logger.debug("Setting misskey: domain='%s',software='%s'", domain, software)
        software = "misskey"
    elif software == "runtube.re":
        logger.debug("Setting peertube: domain='%s',software='%s'", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        logger.debug("Setting nextcloud: domain='%s',software='%s'", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        logger.warning("Spliting of slash: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        logger.warning("Spliting of pipe: domain='%s',software='%s'", domain, software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software['%s']='%s'", type(software), software)
    if software == "":
        logger.warning("tidyup.domain() left no software name behind: domain='%s'", domain)
        software = None

    logger.debug("software[]='%s'", type(software))
    if software is None:
        # An empty name was just turned into None: fall back to the
        # generator here instead of testing str(software) == "", which
        # could never match and let `"." in None` raise TypeError.
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    # One placeholder per argument, the old message had only one for two
    logger.debug("domain='%s',software='%s' - EXIT!", domain, software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    The first ``<td>`` of a row is taken as the domain and the second as
    the reason. Rows without cells, unwanted domains and invalid domain
    names are skipped.

    Args:
        tag: Element whose ``<tr>`` descendants are scanned.

    Returns:
        A list of ``{"domain": ..., "reason": ...}`` dicts. ``reason`` is
        ``None`` for rows that have only one cell.

    Raises:
        ValueError: If ``tag`` is not a ``bs4.element.Tag``.
        KeyError: If ``tag`` contains no ``<tr>`` at all.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select once and reuse for both the emptiness check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if len(cells) == 0:
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        # A row with a single cell has no reason column, don't index past it
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            logger.debug("Multiple domains detected in one row")
            # NOTE(review): only "develop.gab.com" is visible in the
            # reviewed source, the other two names are restored from the
            # matched string - confirm against the repository
            for gab_domain in ["gab.com", "gab.ai", "develop.gab.com"]:
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect the wanted peer domains from a federation response.

    The keys ``"linked"``, ``"allowed"`` and ``"blocked"`` are read in that
    order; missing or ``None`` entries are skipped. A peer may be a plain
    string or a dict with a ``"domain"`` element and is normalised with
    ``tidyup.domain()``.

    Args:
        rows: Mapping holding the peer lists.

    Returns:
        All peers accepted by ``utils.is_domain_wanted()``, in the order
        they were encountered (duplicates are not removed).

    Raises:
        ValueError: If ``rows`` is not a ``dict`` or a peer is neither a
            string, a dict with ``"domain"``, ``None`` nor empty.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Adding peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers