1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
35 from fba.models import instances
37 from fba.networks import lemmy
38 from fba.networks import misskey
39 from fba.networks import peertube
# Module-wide logging setup.
# NOTE(review): logging.basicConfig() at import time configures the
# process-wide root logger, which is unusual for a library module —
# confirm this is intended rather than leaving config to the caller.
41 logging.basicConfig(level=logging.INFO)
42 logger = logging.getLogger(__name__)
# Fetch and register all peer instances advertised by `domain`.
#
# Validates its parameters, determines the remote software when not
# supplied, registers the domain if it is new, fetches the domain's
# peer list, adds every wanted peer as a new instance record, and
# finally flushes pending instance updates.
#
# Parameters:
#   domain   - instance domain name (validated by domain_helper.raise_on())
#   origin   - referring domain, str or None
#   software - software name, str or None (None triggers detection)
#   command  - name of the invoking command, non-empty str
#   path     - optional nodeinfo path hint, str or None
#
# NOTE(review): this view of the file appears to have lines elided
# (e.g. the `try:` statements matching the `except` clauses, the
# `continue` statements after "SKIPPED" branches, and the empty-string
# check preceding the "Parameter 'command' is empty" raise).  The
# comments below describe only the visible code.
44 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
45 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
46 domain_helper.raise_on(domain)
# --- parameter validation / software auto-detection ------------------
48 if not isinstance(origin, str) and origin is not None:
49 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
50 elif not isinstance(command, str):
51 raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
53 raise ValueError("Parameter 'command' is empty")
54 elif software is None:
56 logger.debug("Software for domain='%s' is not set, determining ...", domain)
57 software = determine_software(domain, path)
# Detection failures are recorded on the instance, not re-raised.
58 except network.exceptions as exception:
59 logger.warning("Exception '%s' during determining software type", type(exception))
60 instances.set_last_error(domain, exception)
62 logger.debug("Determined software='%s' for domain='%s'", software, domain)
63 elif not isinstance(software, str):
64 raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
# --- register the domain itself and stamp the fetch time -------------
66 if not instances.is_registered(domain):
67 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
68 instances.add(domain, origin, command, path, software)
70 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
71 instances.set_last_instance_fetch(domain)
# --- fetch the peer list (software-specific) -------------------------
75 logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
76 peerlist = fetch_peers(domain, software, origin)
77 except network.exceptions as exception:
78 logger.warning("Cannot fetch peers from domain='%s': '%s'", domain, type(exception))
80 logger.debug("peerlist[]='%s'", type(peerlist))
81 if isinstance(peerlist, list):
82 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
83 instances.set_total_peers(domain, peerlist)
# Empty/None peer list: log, clear cookies and (presumably) bail out —
# the early-return statement is not visible in this view.
85 logger.debug("peerlist[]='%s'", type(peerlist))
86 if peerlist is None or len(peerlist) == 0:
87 logger.warning("Cannot fetch peers: domain='%s'", domain)
89 logger.debug("Invoking cookies.clear(%s) ...", domain)
# --- iterate peers: tidy each entry, skip unwanted ones, add new -----
95 logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
96 for instance in peerlist:
97 logger.debug("instance='%s'", instance)
99 # Skip "None" types as tidup.domain() cannot parse them
102 logger.debug("instance='%s' - BEFORE!", instance)
103 instance = tidyup.domain(instance)
104 logger.debug("instance='%s' - AFTER!", instance)
107 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
109 elif not utils.is_domain_wanted(instance):
110 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
# Peer entries that are really links to profiles, communities or tags
# are skipped rather than registered as instances.
112 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
113 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
115 elif instance.find("/tag/") > 0:
116 logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
118 elif not instances.is_registered(instance):
119 logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
120 instances.add(instance, domain, command)
# --- cleanup: drop session cookies and flush pending updates ---------
122 logger.debug("Invoking cookies.clear(%s) ...", domain)
123 cookies.clear(domain)
125 logger.debug("Checking if domain='%s' has pending updates ...", domain)
126 if instances.has_pending(domain):
127 logger.debug("Flushing updates for domain='%s' ...", domain)
128 instances.update_data(domain)
130 logger.debug("EXIT!")
# Return the peer (federated-instance) list for `domain`.
#
# Dispatches to software-specific fetchers for misskey, lemmy and
# peertube.  For all other software it runs a CSRF pre-check and then
# queries the Mastodon-compatible "/api/v1/instance/peers" endpoint,
# recording success, errors and the total peer count on the instance.
#
# Parameters:
#   domain   - instance domain (validated by domain_helper.raise_on())
#   software - software name, str or None
#   origin   - referring domain, passed through to lemmy.fetch_peers()
#
# Returns: list of peer domain names (possibly empty).
#
# NOTE(review): lines appear elided in this view — the `try:` matching
# the `except` below, the full `paths` list, the `peers` variable
# initialisation, the `for path in paths:` header and the `return`
# statements.  Comments cover visible code only.
132 def fetch_peers(domain: str, software: str, origin: str) -> list:
133 logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
134 domain_helper.raise_on(domain)
136 if not isinstance(software, str) and software is not None:
137 raise ValueError(f"software[]='{type(software)}' is not 'str'")
# --- software-specific dispatch --------------------------------------
139 if software == "misskey":
140 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
141 return misskey.fetch_peers(domain)
142 elif software == "lemmy":
143 logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
144 return lemmy.fetch_peers(domain, origin)
145 elif software == "peertube":
146 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
147 return peertube.fetch_peers(domain)
# --- generic (Mastodon-compatible) path ------------------------------
149 # No CSRF by default, you don't have to add network.api_headers by yourself here
153 logger.debug("Checking CSRF for domain='%s'", domain)
154 headers = csrf.determine(domain, dict())
155 except network.exceptions as exception:
156 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
157 instances.set_last_error(domain, exception)
161 "/api/v1/instance/peers",
165 # Init peers variable
168 logger.debug("Checking %d paths ...", len(paths))
170 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
171 data = network.get_json_api(
175 (config.get("connection_timeout"), config.get("read_timeout"))
178 logger.debug("data[]='%s'", type(data))
179 if "error_message" in data:
180 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
181 instances.set_last_error(domain, data)
182 elif "json" in data and len(data["json"]) > 0:
183 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
186 logger.debug("Marking domain='%s' as successfully handled ...", domain)
187 instances.set_success(domain)
# Guard against APIs returning a non-list payload.
190 if not isinstance(peers, list):
191 logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
194 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
195 instances.set_total_peers(domain, peers)
197 logger.debug("peers()=%d - EXIT!", len(peers))
# Fetch nodeinfo JSON for `domain`, optionally using a known `path`.
#
# First attempts auto-discovery via fetch_wellknown_nodeinfo(); if
# that yields usable JSON it is returned directly.  Otherwise a CSRF
# pre-check is run and a list of static nodeinfo paths is tried in
# turn.  Error responses carry an "error_message" key (and possibly
# "exception") in the returned dict.
#
# Parameters:
#   domain - instance domain (validated by domain_helper.raise_on())
#   path   - optional known nodeinfo path or full URL, str or None
#
# Returns: dict — either the nodeinfo JSON document or an API
# response/error dict.
#
# NOTE(review): lines are elided in this view (the `try:` for the
# CSRF check, the surrounding error-dict literal, most of the
# `request_paths` list, the loop exit/return statements).
200 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
201 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
202 domain_helper.raise_on(domain)
204 if not isinstance(path, str) and path is not None:
205 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
# --- preferred route: .well-known auto-discovery ---------------------
207 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
208 nodeinfo = fetch_wellknown_nodeinfo(domain)
210 logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
211 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
212 logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
213 return nodeinfo["json"]
# --- fallback: static nodeinfo paths ---------------------------------
215 # No CSRF by default, you don't have to add network.api_headers by yourself here
220 logger.debug("Checking CSRF for domain='%s'", domain)
221 headers = csrf.determine(domain, dict())
222 except network.exceptions as exception:
223 logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
224 instances.set_last_error(domain, exception)
227 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
228 "exception" : exception,
232 "/nodeinfo/2.1.json",
234 "/nodeinfo/2.0.json",
236 "/nodeinfo/1.0.json",
241 for request in request_paths:
242 logger.debug("request='%s'", request)
# NOTE(review): http_url/https_url embed `path` (which may be None),
# not the current `request` — confirm this is intended; with
# path=None these become e.g. "http://<domain>None".
243 http_url = f"http://{domain}{path}"
244 https_url = f"https://{domain}{path}"
246 logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
247 if path is None or path in [request, http_url, https_url]:
248 logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
# A full URL given as `path` is reduced to its path component.
249 if path in [http_url, https_url]:
250 logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
251 components = urlparse(path)
252 path = components.path
254 data = network.get_json_api(
258 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
261 logger.debug("data[]='%s'", type(data))
262 if "error_message" not in data and "json" in data:
263 logger.debug("Success: request='%s'", request)
264 instances.set_detection_mode(domain, "STATIC_CHECK")
265 instances.set_nodeinfo_url(domain, request)
268 logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])
270 logger.debug("data()=%d - EXIT!", len(data))
# Auto-discover nodeinfo for `domain` via /.well-known/nodeinfo.
#
# Fetches the well-known document, then walks its "links" array
# looking for a "rel" matching one of the diaspora nodeinfo schema
# identifiers (newest first).  Relative "href" values are resolved
# against the domain; the referenced nodeinfo URL is then fetched.
#
# Parameters:
#   domain - instance domain (validated by domain_helper.raise_on())
#
# Returns: dict — API response with "json" on success, or an error
# dict containing "error_message" (and possibly "exception").
#
# NOTE(review): lines are elided in this view (the `try:` for the
# CSRF check, list/dict closers, `url = link["href"]` before its use
# below, loop `break`s and the final `return`).
273 def fetch_wellknown_nodeinfo(domain: str) -> dict:
274 logger.debug("domain='%s' - CALLED!", domain)
275 domain_helper.raise_on(domain)
277 # "rel" identifiers (no real URLs)
278 nodeinfo_identifier = [
279 "https://nodeinfo.diaspora.software/ns/schema/2.1",
280 "http://nodeinfo.diaspora.software/ns/schema/2.1",
281 "https://nodeinfo.diaspora.software/ns/schema/2.0",
282 "http://nodeinfo.diaspora.software/ns/schema/2.0",
283 "https://nodeinfo.diaspora.software/ns/schema/1.1",
284 "http://nodeinfo.diaspora.software/ns/schema/1.1",
285 "https://nodeinfo.diaspora.software/ns/schema/1.0",
286 "http://nodeinfo.diaspora.software/ns/schema/1.0",
289 # No CSRF by default, you don't have to add network.api_headers by yourself here
293 logger.debug("Checking CSRF for domain='%s'", domain)
294 headers = csrf.determine(domain, dict())
295 except network.exceptions as exception:
296 logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
297 instances.set_last_error(domain, exception)
300 "error_message": type(exception),
301 "exception" : exception,
# --- fetch the well-known discovery document -------------------------
304 logger.debug("Fetching .well-known info for domain='%s'", domain)
305 data = network.get_json_api(
307 "/.well-known/nodeinfo",
309 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
312 logger.debug("data[]='%s'", type(data))
313 if "error_message" not in data:
314 nodeinfo = data["json"]
316 logger.debug("Marking domain='%s' as successfully handled ...", domain)
317 instances.set_success(domain)
# --- walk the advertised links, newest schema first ------------------
319 logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
320 if "links" in nodeinfo:
321 logger.debug("Found nodeinfo[links]()=%d record(s),", len(nodeinfo["links"]))
322 for niid in nodeinfo_identifier:
325 logger.debug("Checking niid='%s' ...", niid)
326 for link in nodeinfo["links"]:
327 logger.debug("link[%s]='%s'", type(link), link)
328 if not isinstance(link, dict) or not "rel" in link:
329 logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
330 elif link["rel"] == niid:
331 # Default is that 'href' has a complete URL, but some hosts don't send that
332 logger.debug("link[href]='%s' matches niid='%s'", link["href"], niid)
# NOTE(review): `url` is used below without a visible assignment —
# presumably `url = link["href"]` is in an elided line; confirm.
334 components = urlparse(link["href"])
336 logger.debug("components[%s]='%s'", type(components), components)
337 if components.scheme == "" and components.netloc == "":
338 logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
339 url = f"https://{domain}{url}"
340 components = urlparse(url)
342 if not utils.is_domain_wanted(components.netloc):
343 logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
# --- fetch the discovered nodeinfo URL itself ------------------------
346 logger.debug("Fetching nodeinfo from url='%s' ...", url)
347 data = network.fetch_api_url(
349 (config.get("connection_timeout"), config.get("read_timeout"))
352 logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
353 if "error_message" not in data and "json" in data:
354 logger.debug("Found JSON data()=%d,link[href]='%s'", len(data), link["href"])
355 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
356 instances.set_nodeinfo_url(domain, link["href"])
358 logger.debug("Marking domain='%s' as successfully handled ...", domain)
359 instances.set_success(domain)
362 logger.debug("Setting last error for domain='%s',data[]='%s'", domain, type(data))
363 instances.set_last_error(domain, data)
365 logger.debug("data()=%d", len(data))
366 if "error_message" not in data and "json" in data:
367 logger.debug("Auto-discovery successful: domain='%s'", domain)
370 logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)
372 logger.debug("Returning data[]='%s' - EXIT!", type(data))
# Determine the software running on `domain` by scraping its HTML.
#
# Fetches `path` (default "/"), parses it with BeautifulSoup and
# inspects <meta name="generator"> first, then the og:site_name
# property.  The extracted value is tidied and cleaned of version
# numbers and marketing phrases ("powered by", "hosted on", " by ",
# " see ").  The detection mode is recorded on the instance.
#
# Parameters:
#   domain - instance domain (validated by domain_helper.raise_on())
#   path   - HTML path to fetch, non-empty str, defaults to "/"
#
# Returns: str software name, or None (per the annotation and the
# empty-string correction below — the explicit assignments/returns
# are in elided lines).
#
# NOTE(review): lines appear elided here (the empty-`path` check
# before the raise, the initial `software = None`, the `software =
# None` assignment in the empty-string branch, and the `return`).
375 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
376 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
377 domain_helper.raise_on(domain)
379 if not isinstance(path, str):
380 raise ValueError(f"path[]='{type(path)}' is not 'str'")
382 raise ValueError("Parameter 'path' is empty")
384 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
# --- fetch and parse the page ----------------------------------------
387 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
388 response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
390 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
391 if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
392 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
393 doc = bs4.BeautifulSoup(response.text, "html.parser")
395 logger.debug("doc[]='%s'", type(doc))
396 generator = doc.find("meta", {"name" : "generator"})
397 site_name = doc.find("meta", {"property": "og:site_name"})
# Prefer the explicit generator meta tag over og:site_name.
399 logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
400 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
401 logger.debug("Found generator meta tag: domain='%s'", domain)
402 software = tidyup.domain(generator.get("content"))
404 logger.debug("software[%s]='%s'", type(software), software)
405 if software is not None and software != "":
406 logger.info("domain='%s' is generated by '%s'", domain, software)
407 instances.set_detection_mode(domain, "GENERATOR")
408 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
409 logger.debug("Found property=og:site_name, domain='%s'", domain)
410 software = tidyup.domain(site_name.get("content"))
412 logger.debug("software[%s]='%s'", type(software), software)
413 if software is not None and software != "":
414 logger.info("domain='%s' has og:site_name='%s'", domain, software)
415 instances.set_detection_mode(domain, "SITE_NAME")
# --- normalise the extracted value -----------------------------------
417 logger.debug("software[]='%s'", type(software))
418 if isinstance(software, str) and software == "":
419 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
421 elif isinstance(software, str) and ("." in software or " " in software):
422 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
423 software = version.remove(software)
425 logger.debug("software[]='%s'", type(software))
426 if isinstance(software, str) and "powered by " in software:
427 logger.debug("software='%s' has 'powered by' in it", software)
428 software = version.remove(version.strip_powered_by(software))
429 elif isinstance(software, str) and " hosted on " in software:
430 logger.debug("software='%s' has 'hosted on' in it", software)
431 software = version.remove(version.strip_hosted_on(software))
432 elif isinstance(software, str) and " by " in software:
433 logger.debug("software='%s' has ' by ' in it", software)
434 software = version.strip_until(software, " by ")
435 elif isinstance(software, str) and " see " in software:
436 logger.debug("software='%s' has ' see ' in it", software)
437 software = version.strip_until(software, " see ")
439 logger.debug("software='%s' - EXIT!", software)
# Determine which fediverse software `domain` runs.
#
# Primary source is the nodeinfo document (fetch_nodeinfo()); if
# that returns an error, only a message, or lacks [software][name],
# the function falls back to HTML scraping via
# fetch_generator_from_path().  The result is passed through the
# software alias table and stripped of version numbers and
# "powered by" phrases.
#
# Parameters:
#   domain - instance domain (validated by domain_helper.raise_on())
#   path   - optional nodeinfo path hint, str or None
#
# Returns: str software name or None (the "Returning None" branch's
# return statement is in an elided line).
#
# NOTE(review): lines appear elided in this view (the `try:` around
# fetch_nodeinfo(), the data["json"] unwrapping, the None check
# before the alias lookup, and the final `return software`).
442 def determine_software(domain: str, path: str = None) -> str:
443 logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
444 domain_helper.raise_on(domain)
446 if not isinstance(path, str) and path is not None:
447 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
449 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
# --- primary source: nodeinfo ----------------------------------------
452 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
453 data = fetch_nodeinfo(domain, path)
455 logger.debug("data[]='%s'", type(data))
456 if "exception" in data:
457 # Continue raising it
458 logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
459 raise data["exception"]
460 elif "error_message" in data:
461 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
462 software = fetch_generator_from_path(domain)
463 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
465 logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
468 logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
469 software = fetch_generator_from_path(domain)
470 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# --- interpret the nodeinfo JSON payload -----------------------------
472 if "status" in data and data["status"] == "error" and "message" in data:
473 logger.warning("JSON response is an error: '%s'", data["message"])
474 instances.set_last_error(domain, data["message"])
475 instances.set_detection_mode(domain, None)
476 instances.set_nodeinfo_url(domain, None)
477 software = fetch_generator_from_path(domain)
478 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
479 elif "software" in data and "name" in data["software"]:
480 logger.debug("Found data[json][software][name] in JSON response")
481 software = data["software"]["name"]
482 logger.debug("software[%s]='%s' - FOUND!", type(software), software)
483 elif "message" in data:
484 logger.warning("JSON response contains only a message: '%s'", data["message"])
485 instances.set_last_error(domain, data["message"])
486 instances.set_detection_mode(domain, None)
487 instances.set_nodeinfo_url(domain, None)
488 software = fetch_generator_from_path(domain)
489 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
490 elif "software" not in data or "name" not in data["software"]:
491 logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
492 instances.set_detection_mode(domain, None)
493 instances.set_nodeinfo_url(domain, None)
494 software = fetch_generator_from_path(domain)
495 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
497 logger.debug("software[%s]='%s'", type(software), software)
499 logger.debug("Returning None - EXIT!")
# --- normalise the detected name -------------------------------------
502 logger.debug("software='%s'- BEFORE!", software)
503 software = software_helper.alias(software)
504 logger.debug("software['%s']='%s' - AFTER!", type(software), software)
506 if str(software) == "":
507 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
508 software = fetch_generator_from_path(domain)
509 elif len(str(software)) > 0 and ("." in software or " " in software):
510 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
511 software = version.remove(software)
# NOTE(review): this checks "powered by" while
# fetch_generator_from_path() checks "powered by " (trailing space) —
# confirm whether the difference is intentional.
513 logger.debug("software[]='%s'", type(software))
514 if isinstance(software, str) and "powered by" in software:
515 logger.debug("software='%s' has 'powered by' in it", software)
516 software = version.remove(version.strip_powered_by(software))
518 logger.debug("software='%s' - EXIT!", software)
# Extract (domain, reason) entries from an HTML table.
#
# Parameters:
#   tag - a bs4.element.Tag wrapping a <table>; each <tr> is expected
#         to hold the domain in the first <td> and the block reason
#         in the second.
#
# Returns: list of dicts (per the annotation; the `domains = list()`
# initialisation and append statements are in elided lines).
#
# Raises:
#   ValueError - when `tag` is not a bs4 Tag or a row's domain is
#                not a valid domain name (that row is skipped)
#   KeyError   - when the table contains no <tr> rows
#
# NOTE(review): lines appear elided in this view, including the
# result-list construction, `continue` statements, and most of the
# hard-coded multi-domain gab.com row expansion.
521 def find_domains(tag: bs4.element.Tag) -> list:
522 logger.debug("tag[]='%s' - CALLED!", type(tag))
523 if not isinstance(tag, bs4.element.Tag):
524 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
525 elif len(tag.select("tr")) == 0:
526 raise KeyError("No table rows found in table!")
529 for element in tag.select("tr"):
530 logger.debug("element[]='%s'", type(element))
# Skip header rows (no <td> cells).
531 if not element.find("td"):
532 logger.debug("Skipping element, no <td> found")
535 domain = tidyup.domain(element.find("td").text)
536 reason = tidyup.reason(element.findAll("td")[1].text)
538 logger.debug("domain='%s',reason='%s'", domain, reason)
540 if not utils.is_domain_wanted(domain):
541 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
# Special case: one source row lists several gab.com domains in a
# single cell; they are expanded into separate entries.
543 elif domain == "gab.com/.ai, develop.gab.com":
544 logger.debug("Multiple domains detected in one row")
554 "domain": "develop.gab.com",
558 elif not validators.domain(domain.split("/")[0]):
559 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
562 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
568 logger.debug("domains()=%d - EXIT!", len(domains))
# Flatten an API response dict into a list of peer domains.
#
# Looks at the "linked", "allowed" and "blocked" keys of `rows`;
# each entry may be a dict carrying a "domain" key or a plain
# string.  Entries are tidied via tidyup.domain() and filtered
# through utils.is_domain_wanted().
#
# Parameters:
#   rows - dict as returned by a peer/block API endpoint
#
# Returns: list of wanted peer domain names (the `peers = list()`
# initialisation, append and `return` statements are in elided
# lines of this view).
#
# Raises:
#   ValueError - when `rows` is not a dict, or a peer entry is
#                neither a usable dict nor a string
571 def add_peers(rows: dict) -> list:
572 logger.debug("rows[]='%s' - CALLED!", type(rows))
573 if not isinstance(rows, dict):
574 raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")
577 for key in ["linked", "allowed", "blocked"]:
578 logger.debug("Checking key='%s'", key)
# Missing or None-valued keys are silently skipped.
579 if key not in rows or rows[key] is None:
580 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
583 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
584 for peer in rows[key]:
585 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
586 if peer is None or peer == "":
587 logger.debug("peer is empty - SKIPPED")
589 elif isinstance(peer, dict) and "domain" in peer:
590 logger.debug("peer[domain]='%s'", peer["domain"])
591 peer = tidyup.domain(peer["domain"])
592 elif isinstance(peer, str):
593 logger.debug("peer='%s'", peer)
594 peer = tidyup.domain(peer)
596 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
598 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
599 if not utils.is_domain_wanted(peer):
600 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
603 logger.debug("Appending peer='%s' ...", peer)
606 logger.debug("peers()=%d - EXIT!", len(peers))