1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import validators

from fba import csrf
from fba import utils

from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import domain as domain_helper
from fba.helpers import software as software_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# "rel" identifiers (no real URLs) — the schema URIs a .well-known/nodeinfo
# document may advertise in its "links" array.  Order is newest-first so the
# most recent schema wins when several are offered.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peer list of `domain`, register it and all newly seen peers.

    Parameters:
        domain   -- instance to crawl (validated by domain_helper.raise_on())
        origin   -- domain this instance was discovered from, or None
        software -- software name if already known, or None to auto-detect
        command  -- name of the invoking command, stored with each new record
        path     -- optional nodeinfo path hint passed to determine_software()

    Raises ValueError on bad parameter types/values.  Network errors during
    software detection are logged and recorded, not re-raised.
    """
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s' - CALLED!", domain, origin, software, command, path)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        # Software not supplied by caller: best-effort detection, failures
        # are recorded per-instance instead of aborting the crawl.
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")

    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
    peerlist = fetch_peers(domain, software, origin)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None:
        # No peers could be fetched at all - clean up and bail out early.
        logger.warning("Cannot fetch peers: domain='%s'", domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        logger.debug("EXIT!")
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)

        # Skip "None" types as tidyup.domain() cannot parse them
        if instance is None or instance == "":
            continue

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        if instance == "":
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue
        elif not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            logger.debug("Adding new instance='%s',domain='%s',command='%s'", instance, domain, command)
            instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Return the peer list of `domain`.

    Dispatches to software-specific fetchers for misskey/lemmy/peertube,
    otherwise queries the generic Mastodon-style peers API after a CSRF
    pre-check.  Always returns a list (possibly empty); never raises for
    network errors, which are recorded via instances.set_last_error().
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return list()

    # API paths to probe, first successful one wins.
    paths = [
        "/api/v1/instance/peers",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)
            break

    if not isinstance(peers, list):
        # Bad API response (e.g. dict or string) - normalize to empty list.
        logger.warning("peers[]='%s' is not 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo JSON for `domain`.

    First tries auto-discovery via fetch_wellknown_nodeinfo(); on failure
    falls back to probing a list of static nodeinfo paths.  Returns the
    parsed nodeinfo dict on success, or a dict containing "error_message"
    (and possibly "exception") describing the failure.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # Static fallback paths, newest schema first.
    # NOTE(review): intermediate entries were reconstructed - confirm against upstream.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    data = dict()
    for request in request_paths:
        logger.debug("request='%s'", request)
        http_url = f"http://{domain}{path}"
        https_url = f"https://{domain}{path}"

        logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
        if path is None or path in [request, http_url, https_url]:
            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            if path in [http_url, https_url]:
                # Caller supplied a full URL, reduce it to its path component.
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data and "json" in data:
                logger.debug("Success: request='%s'", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break

            logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo via /.well-known/nodeinfo.

    Fetches the well-known document, walks its "links" array and follows
    the first link whose "rel" matches a known nodeinfo schema identifier.
    Relative "href" values are resolved against `domain`.  Returns the
    raw API response dict (containing "json" on success or "error_message"
    on failure).
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": type(exception),
            "exception"    : exception,
        }

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]

        logger.debug("Marking domain='%s' as successfully handled ...", domain)
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s)", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                logger.debug("link[%s]='%s'", type(link), link)
                if not isinstance(link, dict) or not "rel" in link:
                    logger.warning("link[]='%s' is not 'dict' or no element 'rel' found", type(link))
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    url = link["href"]
                    components = urlparse(link["href"])

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        continue

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])

                        logger.debug("Marking domain='%s' as successfully handled ...", domain)
                        instances.set_success(domain)
                        break
                    else:
                        instances.set_last_error(domain, data)
                else:
                    logger.warning("Unknown 'rel' value: domain='%s',link[rel]='%s'", domain, link["rel"])
        else:
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the instance software from an HTML page.

    Fetches `path` from `domain` and inspects the <meta name="generator">
    tag, falling back to <meta property="og:site_name">.  The extracted
    string is cleaned of version numbers and marketing suffixes
    ("powered by", "hosted on", " by ", " see ").  Returns the software
    name or None when nothing usable was found.
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by '%s'", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' has og:site_name='%s'", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name running on `domain`.

    Primary source is nodeinfo (fetch_nodeinfo); on any error-shaped
    response falls back to HTML scraping (fetch_generator_from_path).
    The result is normalized via software_helper.alias() and stripped of
    version numbers / "powered by" suffixes.  Returns None when the
    software cannot be determined.  Re-raises an exception recorded in
    the nodeinfo response under "exception".
    """
    logger.debug("domain(%d)='%s',path='%s' - CALLED!", len(domain), domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[]='%s'", type(data))
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s'", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[software][name] in JSON response")
        software = data["software"]["name"]

    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and reasons from an HTML table.

    Expects `tag` to contain <tr> rows whose first <td> is a domain and
    second <td> a block reason.  A known combined gab.com row is split
    into its three constituent domains.  Returns a list of dicts with
    "domain" and "reason" keys.  Raises ValueError on a non-Tag argument
    and KeyError when the table has no rows.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            # Header rows (<th> only) carry no data.
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Special case: one table row lists three gab domains at once.
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect peer domains from a federation response dict.

    Scans the "linked", "allowed" and "blocked" keys of `rows`; each entry
    may be a plain domain string or a dict with a "domain" key.  Entries
    are tidied and filtered through utils.is_domain_wanted().  Returns the
    list of accepted peer domains.  Raises ValueError for unsupported
    entry types or a non-dict `rows`.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer['domain'])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Adding peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers