1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
35 from fba.models import instances
37 from fba.networks import lemmy
38 from fba.networks import misskey
39 from fba.networks import peertube
# Module-wide logger: INFO level by default, named after this module's import path.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs)
# Accepted nodeinfo schema identifiers, newest schema first; both https://
# and http:// variants occur in the wild, so both forms are listed.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
    # NOTE(review): the closing "]" of this list is elided in this excerpt.
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peer list of `domain` and register every newly seen instance.

    domain   -- instance domain to crawl (validated by domain_helper.raise_on())
    origin   -- domain this instance was discovered from, may be None
    software -- software name if already known; None triggers auto-detection
    command  -- name of the invoking command, stored with the instance record
    path     -- optional nodeinfo path hint forwarded to determine_software()

    Results are persisted through the `instances` model; nothing is returned.

    NOTE(review): this excerpt has elided lines (blank lines, `try:` headers,
    `return`/guard statements); indentation below reconstructs the apparent
    structure — confirm against the full source.
    """
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
    domain_helper.raise_on(domain)

    # Parameter validation: origin may be None, command must be a non-empty str.
    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
        # NOTE(review): the guard line (presumably `elif command == "":`) for
        # the following raise is elided in this excerpt.
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        # Software not supplied by caller: detect it, but never let a network
        # failure abort the crawl — record it on the instance instead.
        logger.debug("Software for domain='%s' is not set, determining ...", domain)
        software = determine_software(domain, path)
        # NOTE(review): the opening `try:` for this handler is elided.
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    # Fetch the peer list; network failures are logged and recorded, not raised.
    logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
    peerlist = fetch_peers(domain, software, origin)
    # NOTE(review): the opening `try:` for this handler is elided.
    except network.exceptions as exception:
        logger.warning("Cannot fetch peers from domain='%s': '%s'", domain, type(exception))

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None or len(peerlist) == 0:
        # Nothing to iterate — clear session cookies and exit early
        # (the `return` for this branch is elided in this excerpt).
        logger.warning("Cannot fetch peers: domain='%s'", domain)
        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)
        logger.debug("EXIT!")

    logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)

        # Skip "None" types as tidyup.domain() cannot parse them
        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        # NOTE(review): the `if instance == "":` guard for this warning and the
        # `continue` statements of the branches below are elided.
        logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            # Links pointing at a single user profile or community are not instances.
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    # Flush any accumulated field changes for this domain to storage.
    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Return the peer (federated instance) list of `domain`.

    Dispatches to software-specific fetchers for misskey/lemmy/peertube; all
    other software is queried through generic JSON API paths after a CSRF
    header check.

    domain   -- domain to query (validated by domain_helper.raise_on())
    software -- software name, or None
    origin   -- origin domain, forwarded to lemmy.fetch_peers()

    NOTE(review): several lines (`try:` header, `paths`/`peers` list
    initialisations, the `for path in paths:` header, `break` and `return`
    statements) are elided in this excerpt.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    # Software-specific peer APIs take precedence over the generic probing below.
    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    logger.debug("Checking CSRF for domain='%s'", domain)
    headers = csrf.determine(domain, dict())
    # NOTE(review): the opening `try:` for this handler is elided.
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)

    # Candidate API endpoints to probe (list construction partly elided):
        "/api/v1/instance/peers",

    # Init peers variable

    logger.debug("Checking %d paths ...", len(paths))
    # NOTE(review): the `for path in paths:` header is elided; the following
    # statements form that loop's body.
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        # NOTE(review): positional arguments of this call are partly elided.
        data = network.get_json_api(
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            # Record the error and fall through to the next candidate path.
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)

    # Defensive: some APIs return a non-list payload for this endpoint.
    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch `domain`'s nodeinfo document.

    First tries well-known auto-discovery (fetch_wellknown_nodeinfo); when
    that yields no JSON, probes a list of static nodeinfo paths.

    domain -- domain to query (validated by domain_helper.raise_on())
    path   -- optional known nodeinfo path or full URL to try first

    Returns the response dict; may contain "error_message"/"exception".

    NOTE(review): `try:`/`return` scaffolding, parts of the error dict, the
    path list, and API-call arguments are elided in this excerpt.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        # Auto-discovery succeeded — no need to probe static paths.
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    logger.debug("Checking CSRF for domain='%s'", domain)
    headers = csrf.determine(domain, dict())
    # NOTE(review): the opening `try:` for this handler is elided.
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # Remainder of the returned error dict (its `return {` line is elided):
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception" : exception,

    # Static nodeinfo paths to probe (list construction partly elided):
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.0.json",
        "/nodeinfo/1.0.json",

    for request in request_paths:
        logger.debug("request='%s'", request)
        # Build both protocol variants so a caller-supplied full URL matches.
        http_url = f"http://{domain}{path}"
        https_url = f"https://{domain}{path}"

        logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
        if path is None or path in [request, http_url, https_url]:
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [http_url, https_url]:
                # Caller supplied a full URL: keep only its path component.
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            # NOTE(review): positional arguments of this call are partly elided.
            data = network.get_json_api(
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data and "json" in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)

            # NOTE(review): the guard (presumably an elif on error_message) for
            # this warning is elided.
            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover `domain`'s nodeinfo document via /.well-known/nodeinfo.

    Follows the "links" entries whose "rel" matches a known schema identifier
    (see module-level nodeinfo_identifier) and fetches the referenced
    document. Returns the API response dict; may carry "error_message"/
    "exception" elements.

    NOTE(review): `try:` header, `url` initialisation, `break`/`continue`
    and some `else:` lines are elided in this excerpt.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    logger.debug("Checking CSRF for domain='%s'", domain)
    headers = csrf.determine(domain, dict())
    # NOTE(review): the opening `try:` for this handler is elided.
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        # Remainder of the returned error dict (its `return {` line is elided):
            "error_message": type(exception),
            "exception" : exception,

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    # NOTE(review): positional arguments of this call are partly elided.
    data = network.get_json_api(
        "/.well-known/nodeinfo",
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    if "error_message" not in data:
        nodeinfo = data["json"]

        logger.debug("Marking domain='%s' as successfully handled ...", domain)
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    # NOTE(review): the `url = link["href"]` initialisation
                    # appears elided here.
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        # Relative href: prepend scheme + the instance's own domain.
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        # NOTE(review): the `continue` for this skip is elided.
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    # NOTE(review): positional arguments of this call are partly elided.
                    data = network.fetch_api_url(
                        (config.get("connection_timeout"), config.get("read_timeout"))

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])

                        logger.debug("Marking domain='%s' as successfully handled ...", domain)
                        instances.set_success(domain)

                        # NOTE(review): the `else:` introducing this error branch is elided.
                        logger.debug("Setting last error for domain='%s',data[]='%s'", domain, type(data))
                        instances.set_last_error(domain, data)

                    # NOTE(review): the `else:` introducing this branch is elided.
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])

            # NOTE(review): the `else:` introducing this branch is elided.
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of `domain` by scraping `path` for a
    <meta name="generator"> or <meta property="og:site_name"> tag.

    domain -- domain to scrape (validated by domain_helper.raise_on())
    path   -- HTML page path to fetch, defaults to "/"

    Returns the cleaned software name; the None fall-through and some
    `software = None` assignments are elided in this excerpt.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
        # NOTE(review): the guard line (presumably `elif path == "":`) for
        # the following raise is elided.
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # Only parse successful responses that actually look like HTML.
    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # The generator meta tag wins over og:site_name when both are present.
        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        # Normalise empty string to None (the assignment line is elided here).
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    # Strip common marketing suffixes ("powered by", "hosted on", " by ", " see ").
    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name running on `domain`.

    Tries nodeinfo first (fetch_nodeinfo); on errors or when
    [software][name] is missing, falls back to fetch_generator_from_path().
    The result is normalised via software_helper.alias() and version/suffix
    stripping.

    NOTE(review): some lines (`try:` header, `software = None` init, early
    `return` statements) are elided in this excerpt.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        # Nodeinfo failed — fall back to HTML generator-tag detection.
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    # NOTE(review): the `if software is None:` guard and its `return None`
    # around this early-exit log line are elided.
    logger.debug("Returning None - EXIT!")

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract (domain, reason) entries from an HTML blocklist table.

    tag -- bs4 table tag; each <tr> is expected to hold a domain in the
           first <td> and the block reason in the second <td>.

    NOTE(review): several lines (`domains` list init, `continue`
    statements, the append bodies for the multi-domain special case and the
    normal case, and the `return`) are elided in this excerpt.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            # Header rows carry <th> only — skip them.
            logger.debug("Skipping element, no <td> found")

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
        elif domain == "gab.com/.ai, develop.gab.com":
            # Special case: one table row lists several gab.com domains at once.
            logger.debug("Multiple domains detected in one row")
            # NOTE(review): the append calls for the individual domains are
            # elided; only this fragment of one entry remains.
                "domain": "develop.gab.com",
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)

    logger.debug("domains()=%d - EXIT!", len(domains))
def add_peers(rows: dict) -> list:
    """Collect wanted peer domains from the "linked"/"allowed"/"blocked"
    lists of an API response.

    rows -- dict whose keys may map to lists of peers; each peer is either a
            plain domain string or a dict carrying a "domain" element.

    NOTE(review): `peers` list init, `continue` statements, the `else:` for
    unsupported types, the append call and the `return` are elided in this
    excerpt.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            # Normalise the peer to a bare domain string.
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            # NOTE(review): the `else:` introducing this raise is elided.
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)

            logger.debug("Appending peer='%s' ...", peer)

    logger.debug("peers()=%d - EXIT!", len(peers))