1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
35 from fba.models import instances
37 from fba.networks import lemmy
38 from fba.networks import misskey
39 from fba.networks import peertube
# Module-wide logging setup for this federation-crawling module.
# NOTE(review): basicConfig() at import time configures the root logger as a
# side effect; the `import logging` line is not visible in this excerpt (the
# listing's baked-in line numbers have gaps), so presumably it sits in the
# omitted import lines — confirm against the full file.
41 logging.basicConfig(level=logging.INFO)
42 logger = logging.getLogger(__name__)
# Fetch the peer list of `domain` (determining its software type first when
# not given) and register every newly seen, wanted instance in the local
# `instances` store. Side effects only; no return value besides None.
#
# Parameters:
#   domain   - instance host name to crawl (validated by domain_helper.raise_on)
#   origin   - domain that referred us to `domain` (str or None)
#   software - software name if already known, else None (triggers detection)
#   command  - name of the CLI command that initiated this crawl (audit trail)
#   path     - optional nodeinfo path hint, forwarded to determine_software()
#
# NOTE(review): this excerpt is a numbered listing with gaps; several original
# lines are omitted (e.g. the `try:` lines that must precede the bare `except`
# clauses below, the empty-`command` check at original line 52, and the
# `continue` statements after the SKIPPED branches). Comments below flag the
# gaps rather than guessing the missing code.
44 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
45 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
46 domain_helper.raise_on(domain)
# Parameter validation: reject wrong types early with ValueError.
48 if not isinstance(origin, str) and origin is not None:
49 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
50 elif not isinstance(command, str):
51 raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
# NOTE(review): the guard condition for this raise (original line 52,
# presumably `elif command == "":`) is missing from this listing.
53 raise ValueError("Parameter 'command' is empty")
54 elif software is None:
# Software type unknown: probe the instance. The `try:` opening this
# handler (original line 55) is missing from this listing.
56 logger.debug("Software for domain='%s' is not set, determining ...", domain)
57 software = determine_software(domain, path)
58 except network.exceptions as exception:
# Network-level failures are recorded on the instance, not re-raised.
59 logger.warning("Exception '%s' during determining software type", type(exception))
60 instances.set_last_error(domain, exception)
62 logger.debug("Determined software='%s' for domain='%s'", software, domain)
63 elif not isinstance(software, str):
64 raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
# Register the instance if we have never seen it before.
66 logger.debug("Checking if domain='%s' is registered ...", domain)
67 if not instances.is_registered(domain):
68 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
69 instances.add(domain, origin, command, path, software)
# Stamp the crawl time regardless of registration state.
71 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
72 instances.set_last_instance_fetch(domain)
# Fetch this instance's peer list. NOTE(review): the `try:` opening this
# handler (around original line 75) is missing from this listing.
76 logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
77 peerlist = fetch_peers(domain, software, origin)
78 except network.exceptions as exception:
79 logger.warning("Cannot fetch peers from domain='%s': '%s'", domain, type(exception))
# Record the total peer count when a list came back.
81 logger.debug("peerlist[]='%s'", type(peerlist))
82 if isinstance(peerlist, list):
83 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
84 instances.set_total_peers(domain, peerlist)
86 logger.debug("peerlist[]='%s'", type(peerlist))
87 if peerlist is None or len(peerlist) == 0:
# No peers: flush any pending DB updates and bail out. NOTE(review): the
# cookies.clear(domain) call and the early `return` that should follow
# (original lines ~95-98) are missing from this listing.
88 logger.warning("Cannot fetch peers: domain='%s'", domain)
90 if instances.has_pending(domain):
91 logger.debug("Flushing updates for domain='%s' ...", domain)
92 instances.update_data(domain)
94 logger.debug("Invoking cookies.clear(%s) ...", domain)
# Walk the peer list and register every new, wanted instance.
100 logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
101 for instance in peerlist:
102 logger.debug("instance='%s'", instance)
# Skip "None" types as tidyup.domain() cannot parse them
# NOTE(review): the None/empty check and its `continue` (original lines
# 105-106) are missing from this listing.
107 logger.debug("instance='%s' - BEFORE!", instance)
108 instance = tidyup.domain(instance)
109 logger.debug("instance='%s' - AFTER!", instance)
# NOTE(review): the `if instance == "":` guard for this warning (original
# line ~111) and the `continue` statements after each SKIPPED branch
# below are missing from this listing.
112 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
114 elif not utils.is_domain_wanted(instance):
115 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
# Filter out per-user/profile/community URLs that are not real hosts;
# str.find() > 0 means "found, and not at position 0".
117 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
118 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
120 elif instance.find("/tag/") > 0:
121 logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
123 elif not instances.is_registered(instance):
124 logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
125 instances.add(instance, domain, command)
# Cleanup: drop session cookies for this domain and flush pending updates.
127 logger.debug("Invoking cookies.clear(%s) ...", domain)
128 cookies.clear(domain)
130 logger.debug("Checking if domain='%s' has pending updates ...", domain)
131 if instances.has_pending(domain):
132 logger.debug("Flushing updates for domain='%s' ...", domain)
133 instances.update_data(domain)
135 logger.debug("EXIT!")
# Fetch the peer (known-instances) list of `domain`.
#
# Dispatches to software-specific fetchers for misskey/lemmy/peertube and
# otherwise falls back to the Mastodon-compatible HTTP API endpoint(s).
# Returns a list of peer domain names (possibly empty).
#
# NOTE(review): this listing has gaps — the `try:` before the CSRF check,
# the error-return dict after set_last_error, the head of the `paths` list,
# the `peers = list()` init, the `for path in paths:` line, the remaining
# arguments of network.get_json_api(), the `peers = data["json"]` assignment
# and the final `return peers` are all omitted here.
137 def fetch_peers(domain: str, software: str, origin: str) -> list:
138 logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
139 domain_helper.raise_on(domain)
141 if not isinstance(software, str) and software is not None:
142 raise ValueError(f"software[]='{type(software)}' is not 'str'")
# Software-specific peer fetchers take precedence over the generic API.
144 if software == "misskey":
145 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
146 return misskey.fetch_peers(domain)
147 elif software == "lemmy":
148 logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
149 return lemmy.fetch_peers(domain, origin)
150 elif software == "peertube":
151 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
152 return peertube.fetch_peers(domain)
154 # No CSRF by default, you don't have to add network.api_headers by yourself here
# NOTE(review): the `try:` opening this CSRF probe (original line ~155-157)
# is missing from this listing.
158 logger.debug("Checking CSRF for domain='%s'", domain)
159 headers = csrf.determine(domain, dict())
160 except network.exceptions as exception:
161 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
162 instances.set_last_error(domain, exception)
# Candidate API paths for a Mastodon-compatible peers endpoint.
# NOTE(review): the surrounding `paths = [` literal and any further entries
# (original lines 163-169) are missing from this listing.
166 "/api/v1/instance/peers",
170 # Init peers variable
173 logger.debug("Checking %d paths ...", len(paths))
175 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
176 data = network.get_json_api(
# NOTE(review): the domain/path/headers arguments of this call (original
# lines 177-179) are missing from this listing.
180 (config.get("connection_timeout"), config.get("read_timeout"))
183 logger.debug("data[]='%s'", type(data))
184 if "error_message" in data:
# Failed path: remember the error and presumably try the next path.
185 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
186 instances.set_last_error(domain, data)
187 elif "json" in data and len(data["json"]) > 0:
188 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
# NOTE(review): the `peers = data["json"]` assignment (original line ~189)
# is missing from this listing.
191 logger.debug("Marking domain='%s' as successfully handled ...", domain)
192 instances.set_success(domain)
# Defensive check: some instances return non-list JSON for this endpoint.
195 if not isinstance(peers, list):
196 logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
199 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
200 instances.set_total_peers(domain, peers)
# NOTE(review): the `return peers` statement (original line ~203) is
# missing from this listing.
202 logger.debug("peers()=%d - EXIT!", len(peers))
# Fetch nodeinfo JSON for `domain`.
#
# First tries the well-known auto-discovery document; when that fails, falls
# back to probing a static list of versioned nodeinfo paths directly. Returns
# a dict: either the nodeinfo JSON payload or an error descriptor containing
# "error_message"/"exception".
#
# NOTE(review): this listing has gaps — the `try:` before the CSRF probe,
# the dict literal wrapping the error return, the head/tail of the
# `request_paths` list, the get_json_api() arguments and the return
# statements are omitted here.
205 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
206 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
207 domain_helper.raise_on(domain)
209 if not isinstance(path, str) and path is not None:
210 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
# Preferred route: well-known auto-discovery.
212 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
213 nodeinfo = fetch_wellknown_nodeinfo(domain)
215 logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
216 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
217 logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
218 return nodeinfo["json"]
220 # No CSRF by default, you don't have to add network.api_headers by yourself here
# NOTE(review): the `try:` opening this CSRF probe (original lines 221-224)
# is missing from this listing.
225 logger.debug("Checking CSRF for domain='%s'", domain)
226 headers = csrf.determine(domain, dict())
227 except network.exceptions as exception:
228 logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
229 instances.set_last_error(domain, exception)
# Error descriptor returned on CSRF failure. NOTE(review): the `return {`
# wrapper and closing brace (original lines 230-234) are missing here.
232 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
233 "exception" : exception,
# Static fallback paths, newest schema first. NOTE(review): the
# `request_paths = [` head and interleaved non-.json variants (original
# lines 235-245) are missing from this listing.
237 "/nodeinfo/2.1.json",
239 "/nodeinfo/2.0.json",
241 "/nodeinfo/1.0.json",
246 for request in request_paths:
247 logger.debug("request='%s'", request)
# Build absolute URL variants so a caller-supplied `path` can match either
# scheme. When path is None these embed the string "None", but that is
# harmless: the `path is None` disjunct below short-circuits first.
248 http_url = f"http://{domain}{path}"
249 https_url = f"https://{domain}{path}"
251 logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
252 if path is None or path in [request, http_url, https_url]:
253 logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
254 if path in [http_url, https_url]:
# path carries a scheme+host: strip it down to the path component.
255 logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
256 components = urlparse(path)
257 path = components.path
259 data = network.get_json_api(
# NOTE(review): the domain/request/headers arguments of this call
# (original lines 260-262) are missing from this listing.
263 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
266 logger.debug("data[]='%s'", type(data))
267 if "error_message" not in data and "json" in data:
# Direct probe succeeded: remember how this instance was detected.
268 logger.debug("Success: request='%s' - Setting detection_mode=STATIC_CHECK ...", request)
269 instances.set_detection_mode(domain, "STATIC_CHECK")
270 instances.set_nodeinfo_url(domain, request)
# NOTE(review): a `break` (original line ~271) and the `elif`/`else`
# wrapping this warning are missing from this listing.
273 logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])
# NOTE(review): the final `return data` (original line ~276) is missing.
275 logger.debug("data()=%d - EXIT!", len(data))
# Auto-discover nodeinfo for `domain` via /.well-known/nodeinfo.
#
# Fetches the well-known discovery document, then walks its "links" array
# matching each entry's "rel" against known diaspora nodeinfo schema
# identifiers (newest first) and fetches the first wanted, resolvable href.
# Returns a dict with the fetched data or an error descriptor.
#
# NOTE(review): this listing has gaps — the `try:` before the CSRF probe,
# the error-return dict wrapper, the `url = link["href"]` assignment that
# the prepend branch below relies on, the `break` statements, and the final
# `return data` are omitted here.
278 def fetch_wellknown_nodeinfo(domain: str) -> dict:
279 logger.debug("domain='%s' - CALLED!", domain)
280 domain_helper.raise_on(domain)
282 # "rel" identifiers (no real URLs)
283 nodeinfo_identifier = [
284 "https://nodeinfo.diaspora.software/ns/schema/2.1",
285 "http://nodeinfo.diaspora.software/ns/schema/2.1",
286 "https://nodeinfo.diaspora.software/ns/schema/2.0",
287 "http://nodeinfo.diaspora.software/ns/schema/2.0",
288 "https://nodeinfo.diaspora.software/ns/schema/1.1",
289 "http://nodeinfo.diaspora.software/ns/schema/1.1",
290 "https://nodeinfo.diaspora.software/ns/schema/1.0",
291 "http://nodeinfo.diaspora.software/ns/schema/1.0",
294 # No CSRF by default, you don't have to add network.api_headers by yourself here
# NOTE(review): the `try:` opening this CSRF probe (original lines 295-297)
# is missing from this listing.
298 logger.debug("Checking CSRF for domain='%s'", domain)
299 headers = csrf.determine(domain, dict())
300 except network.exceptions as exception:
301 logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
302 instances.set_last_error(domain, exception)
# Error descriptor returned on CSRF failure. NOTE(review): the `return {`
# wrapper (original lines 303-307) is missing from this listing.
305 "error_message": type(exception),
306 "exception" : exception,
# Fetch the discovery document itself.
309 logger.debug("Fetching .well-known info for domain='%s'", domain)
310 data = network.get_json_api(
# NOTE(review): the domain/headers arguments of this call (original lines
# 311, 313) are missing from this listing.
312 "/.well-known/nodeinfo",
314 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
317 logger.debug("data[]='%s'", type(data))
318 if "error_message" not in data:
319 nodeinfo = data["json"]
321 logger.debug("Marking domain='%s' as successfully handled ...", domain)
322 instances.set_success(domain)
324 logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
325 if "links" in nodeinfo:
326 logger.debug("Found nodeinfo[links]()=%d record(s),", len(nodeinfo["links"]))
# Outer loop: try identifiers newest-schema-first; inner loop scans links.
327 for niid in nodeinfo_identifier:
330 logger.debug("Checking niid='%s' ...", niid)
331 for link in nodeinfo["links"]:
332 logger.debug("link[%s]='%s'", type(link), link)
333 if not isinstance(link, dict) or not "rel" in link:
334 logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
335 elif link["rel"] == niid:
336 # Default is that 'href' has a complete URL, but some hosts don't send that
337 logger.debug("link[href]='%s' matches niid='%s'", link["href"], niid)
# NOTE(review): the `url = link["href"]` assignment (original line
# ~338) is missing from this listing; the f-string below reads `url`.
339 components = urlparse(link["href"])
341 logger.debug("components[%s]='%s'", type(components), components)
342 if components.scheme == "" and components.netloc == "":
# Relative href: prepend https:// and the instance's own domain.
343 logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
344 url = f"https://{domain}{url}"
345 components = urlparse(url)
# Skip hrefs pointing at unwanted/blacklisted hosts.
347 if not utils.is_domain_wanted(components.netloc):
348 logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
351 logger.debug("Fetching nodeinfo from url='%s' ...", url)
352 data = network.fetch_api_url(
# NOTE(review): the url/headers arguments of this call (original
# line 353) are missing from this listing.
354 (config.get("connection_timeout"), config.get("read_timeout"))
357 logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
358 if "error_message" not in data and "json" in data:
# Auto-discovery hit: record detection mode and nodeinfo URL.
359 logger.debug("Found JSON data()=%d,link[href]='%s' - Setting detection_mode=AUTO_DISCOVERY ...", len(data), link["href"])
360 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
361 instances.set_nodeinfo_url(domain, link["href"])
363 logger.debug("Marking domain='%s' as successfully handled ...", domain)
364 instances.set_success(domain)
# NOTE(review): the `break` ending the inner loop and the `else:`
# wrapping the error path (original lines 365-366) are missing here.
367 logger.debug("Setting last error for domain='%s',data[]='%s'", domain, type(data))
368 instances.set_last_error(domain, data)
370 logger.debug("data()=%d", len(data))
371 if "error_message" not in data and "json" in data:
# NOTE(review): the outer-loop `break` (original line ~373) is missing.
372 logger.debug("Auto-discovery successful: domain='%s'", domain)
375 logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)
# NOTE(review): the final `return data` (original line ~378) is missing.
377 logger.debug("Returning data[]='%s' - EXIT!", type(data))
# Determine the software name of `domain` by scraping its HTML.
#
# Fetches `path` (default "/") and inspects the <meta name="generator"> tag,
# falling back to <meta property="og:site_name">. The extracted string is then
# cleaned of version numbers and marketing suffixes ("powered by", "hosted
# on", " by ", " see ") via the version helper. Returns the cleaned software
# name, or presumably None when nothing usable was found (the `software =
# None` initialisation and final `return` are among the lines omitted from
# this listing).
380 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
381 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
382 domain_helper.raise_on(domain)
384 if not isinstance(path, str):
385 raise ValueError(f"path[]='{type(path)}' is not 'str'")
# NOTE(review): the guard for this raise (original line 386, presumably
# `elif path == "":`) is missing from this listing.
387 raise ValueError("Parameter 'path' is empty")
389 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
392 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
393 response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
395 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
# Only parse successful responses that actually contain an HTML document.
396 if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
397 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
398 doc = bs4.BeautifulSoup(response.text, "html.parser")
400 logger.debug("doc[]='%s'", type(doc))
401 generator = doc.find("meta", {"name" : "generator"})
402 site_name = doc.find("meta", {"property": "og:site_name"})
404 logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
# Prefer the explicit generator meta tag over og:site_name.
405 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
406 logger.debug("Found generator meta tag: domain='%s'", domain)
407 software = tidyup.domain(generator.get("content"))
409 logger.debug("software[%s]='%s'", type(software), software)
410 if software is not None and software != "":
411 logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
412 instances.set_detection_mode(domain, "GENERATOR")
413 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
414 logger.debug("Found property=og:site_name, domain='%s'", domain)
415 software = tidyup.domain(site_name.get("content"))
417 logger.debug("software[%s]='%s'", type(software), software)
418 if software is not None and software != "":
419 logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
420 instances.set_detection_mode(domain, "SITE_NAME")
# Normalise the extracted value: empty string -> None, then strip version
# numbers. NOTE(review): the `software = None` line under the first branch
# (original line ~425) is missing from this listing.
422 logger.debug("software[]='%s'", type(software))
423 if isinstance(software, str) and software == "":
424 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
426 elif isinstance(software, str) and ("." in software or " " in software):
427 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
428 software = version.remove(software)
# Strip common marketing phrasing; only the first matching rule applies.
430 logger.debug("software[]='%s'", type(software))
431 if isinstance(software, str) and "powered by " in software:
432 logger.debug("software='%s' has 'powered by' in it", software)
433 software = version.remove(version.strip_powered_by(software))
434 elif isinstance(software, str) and " hosted on " in software:
435 logger.debug("software='%s' has 'hosted on' in it", software)
436 software = version.remove(version.strip_hosted_on(software))
437 elif isinstance(software, str) and " by " in software:
438 logger.debug("software='%s' has ' by ' in it", software)
439 software = version.strip_until(software, " by ")
440 elif isinstance(software, str) and " see " in software:
441 logger.debug("software='%s' has ' see ' in it", software)
442 software = version.strip_until(software, " see ")
# NOTE(review): the final `return software` (original line ~445) is missing.
444 logger.debug("software='%s' - EXIT!", software)
# Determine which fediverse software `domain` runs.
#
# Strategy: fetch nodeinfo (optionally via a `path` hint); on error fall back
# to HTML scraping (fetch_generator_from_path). The result is passed through
# software_helper.alias() and cleaned of version numbers / "powered by"
# phrasing. Returns the normalised software name, or presumably None when
# detection failed (the early `return None` is among the omitted lines).
#
# NOTE(review): this listing has gaps — the `software = None` init, the
# `elif "json" in data...` / `data = data["json"]` scaffolding between the
# branches, and the return statements are omitted here.
447 def determine_software(domain: str, path: str = None) -> str:
448 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
449 domain_helper.raise_on(domain)
451 if not isinstance(path, str) and path is not None:
452 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
454 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
457 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
458 data = fetch_nodeinfo(domain, path)
460 logger.debug("data[]='%s'", type(data))
461 if "exception" in data:
462 # Continue raising it
463 logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
464 raise data["exception"]
465 elif "error_message" in data:
# Nodeinfo failed: fall back to scraping the instance's HTML.
466 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
467 software = fetch_generator_from_path(domain)
468 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
470 logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
# NOTE(review): the branch condition and `data = data["json"]` unwrapping
# (original lines 469-472) are missing from this listing.
473 logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
474 software = fetch_generator_from_path(domain)
475 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# Explicit API error payload: record it, reset detection state, re-scrape.
477 if "status" in data and data["status"] == "error" and "message" in data:
478 logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
479 instances.set_last_error(domain, data["message"])
480 instances.set_detection_mode(domain, None)
481 instances.set_nodeinfo_url(domain, None)
482 software = fetch_generator_from_path(domain)
483 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
484 elif "software" in data and "name" in data["software"]:
# The normal, well-formed nodeinfo case.
485 logger.debug("Found data[json][software][name] in JSON response")
486 software = data["software"]["name"]
487 logger.debug("software[%s]='%s' - FOUND!", type(software), software)
488 elif "message" in data:
489 logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
490 instances.set_last_error(domain, data["message"])
491 instances.set_detection_mode(domain, None)
492 instances.set_nodeinfo_url(domain, None)
494 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
495 software = fetch_generator_from_path(domain)
496 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
497 elif "software" not in data or "name" not in data["software"]:
498 logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
499 instances.set_detection_mode(domain, None)
500 instances.set_nodeinfo_url(domain, None)
502 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
503 software = fetch_generator_from_path(domain)
504 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# Post-processing of the detected name.
506 logger.debug("software[%s]='%s'", type(software), software)
# NOTE(review): the `if software is None:` guard and `return None` for this
# debug line (original lines 507-509) are missing from this listing.
508 logger.debug("Returning None - EXIT!")
# Map known aliases (e.g. forks) onto their canonical software name.
511 logger.debug("software='%s'- BEFORE!", software)
512 software = software_helper.alias(software)
513 logger.debug("software['%s']='%s' - AFTER!", type(software), software)
515 if str(software) == "":
516 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
517 software = fetch_generator_from_path(domain)
518 elif len(str(software)) > 0 and ("." in software or " " in software):
519 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
520 software = version.remove(software)
522 logger.debug("software[]='%s'", type(software))
523 if isinstance(software, str) and "powered by" in software:
524 logger.debug("software='%s' has 'powered by' in it", software)
525 software = version.remove(version.strip_powered_by(software))
# NOTE(review): the final `return software` (original line ~528) is missing.
527 logger.debug("software='%s' - EXIT!", software)
# Extract (domain, reason) pairs from an HTML blocklist table.
#
# `tag` must be a bs4 Tag containing <tr> rows whose first <td> holds a
# domain and second <td> a block reason. Returns a list of dicts; raises
# ValueError on a wrong-typed argument and KeyError when no rows exist.
#
# NOTE(review): this listing has gaps — the `domains = list()` init, the
# `continue` statements, the special-case dict literals for the hard-coded
# gab.com row (original lines 554-566), the appends and the final
# `return domains` are omitted here.
530 def find_domains(tag: bs4.element.Tag) -> list:
531 logger.debug("tag[]='%s' - CALLED!", type(tag))
532 if not isinstance(tag, bs4.element.Tag):
533 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
534 elif len(tag.select("tr")) == 0:
535 raise KeyError("No table rows found in table!")
538 for element in tag.select("tr"):
539 logger.debug("element[]='%s'", type(element))
540 if not element.find("td"):
# Header or malformed row: no data cells to parse.
541 logger.debug("Skipping element, no <td> found")
# First cell is the blocked domain, second the stated reason.
544 domain = tidyup.domain(element.find("td").text)
545 reason = tidyup.reason(element.findAll("td")[1].text)
547 logger.debug("domain='%s',reason='%s'", domain, reason)
549 if not utils.is_domain_wanted(domain):
550 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
# Known source-data quirk: one upstream row lists two gab domains fused
# into a single cell; it is split into separate entries here.
552 elif domain == "gab.com/.ai, develop.gab.com":
553 logger.debug("Multiple domains detected in one row")
563 "domain": "develop.gab.com",
# Reject entries whose host part is not a syntactically valid domain.
567 elif not validators.domain(domain.split("/")[0]):
568 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
571 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
# NOTE(review): the `return domains` (original line ~578) is missing.
577 logger.debug("domains()=%d - EXIT!", len(domains))
# Flatten an instance's federation lists into one list of peer domains.
#
# `rows` is a dict that may contain "linked", "allowed" and "blocked" keys,
# each mapping to an iterable of peers; a peer is either a plain domain
# string or a dict with a "domain" key. Each peer is tidied and filtered by
# utils.is_domain_wanted() before being collected. Returns the combined list.
#
# NOTE(review): this listing has gaps — the `peers = list()` init, the
# `continue` statements after each SKIPPED branch, the `else:` before the
# unsupported-type raise, the `peers.append(peer)` call and the final
# `return peers` are omitted here.
580 def add_peers(rows: dict) -> list:
581 logger.debug("rows[]='%s' - CALLED!", type(rows))
582 if not isinstance(rows, dict):
583 raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")
586 for key in ["linked", "allowed", "blocked"]:
587 logger.debug("Checking key='%s'", key)
588 if key not in rows or rows[key] is None:
589 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
592 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
593 for peer in rows[key]:
594 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
595 if peer is None or peer == "":
596 logger.debug("peer is empty - SKIPPED")
# Peers may come as {"domain": ...} dicts or plain strings; both are
# normalised through tidyup.domain().
598 elif isinstance(peer, dict) and "domain" in peer:
599 logger.debug("peer[domain]='%s'", peer["domain"])
600 peer = tidyup.domain(peer["domain"])
601 elif isinstance(peer, str):
602 logger.debug("peer='%s'", peer)
603 peer = tidyup.domain(peer)
# Any other payload type is a hard error (NOTE(review): the `else:`
# introducing this raise, original line 604, is missing here).
605 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
607 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
608 if not utils.is_domain_wanted(peer):
609 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
612 logger.debug("Appending peer='%s' ...", peer)
615 logger.debug("peers()=%d - EXIT!", len(peers))