# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import requests
import validators

from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import domain as domain_helper
from fba.helpers import software as software_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import csrf
from fba.http import network
from fba.http import nodeinfo

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
# Depth counter, being raised and lowered by fetch_instances() on every
# (possibly recursive) invocation; compared against "max_crawl_depth" there.
_DEPTH = 0

# Module-wide logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peers of ``domain`` and register every instance not yet known.

    After validating the parameters the function determines the software of
    ``domain`` (when not given and nodeinfo was not fetched recently),
    registers the domain if needed, fetches its peer list through
    fetch_peers() and walks over it. Each peer is tidied up, IDNA-encoded and
    skipped when unwanted or when it is a link to a profile or tag. Peers
    that are not registered yet are either crawled recursively (while the
    depth counter is within config "max_crawl_depth" and the peer list has at
    least config "min_peers_length" entries) or only added to the database.

    Args:
        domain: Domain to fetch peers from, checked by domain_helper.raise_on().
        origin: Domain that led to ``domain``, or None.
        software: Software name of ``domain``, or None to have it determined.
        command: Name of the invoking command, must be a non-empty string.
        path: Optional path handed to determine_software().

    Raises:
        ValueError: On a parameter of the wrong type, an empty ``command``,
            a missing ``origin`` for the listed commands, or when ``domain``
            was recently fetched ("last_instance_fetch").
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)

    # Raise the depth counter for this invocation. The finally-clause below
    # lowers it again on every exit path, so an exception cannot leave the
    # counter raised for later invocations.
    _DEPTH = _DEPTH + 1
    try:
        domain_helper.raise_on(domain)

        if not isinstance(origin, str) and origin is not None:
            raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
        elif not isinstance(command, str):
            raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
        elif command == "":
            raise ValueError("Parameter 'command' is empty")
        elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
            raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
        elif not isinstance(path, str) and path is not None:
            raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
        elif instances.is_recent(domain, "last_instance_fetch"):
            raise ValueError(f"domain='{domain}' has recently been fetched but function was invoked")
        elif software is None and not instances.is_recent(domain, "last_nodeinfo"):
            try:
                logger.debug("Software for domain='%s' is not set, determining ...", domain)
                software = determine_software(domain, path)
            except network.exceptions as exception:
                # Best effort: record the error and carry on with software=None
                logger.warning("Exception '%s' during determining software type", type(exception))
                instances.set_last_error(domain, exception)

            logger.debug("Determined software='%s' for domain='%s'", software, domain)
        elif software is None:
            logger.debug("domain='%s' has unknown software or nodeinfo has recently being fetched", domain)
        elif not isinstance(software, str):
            raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")

        logger.debug("Checking if domain='%s' is registered ...", domain)
        if not instances.is_registered(domain):
            logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
            instances.add(domain, origin, command, path, software)

            logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
            instances.set_last_instance_fetch(domain)

        # Stays empty when the software is unknown or fetching peers failed
        peerlist = list()

        logger.debug("software='%s'", software)
        if software is not None:
            try:
                logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
                peerlist = fetch_peers(domain, software, origin)
            except network.exceptions as exception:
                logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

        logger.debug("peerlist[]='%s'", type(peerlist))
        if isinstance(peerlist, list):
            logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
            instances.set_total_peers(domain, peerlist)

        logger.debug("peerlist[]='%s'", type(peerlist))
        if peerlist is None or len(peerlist) == 0:
            logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)

            if instances.has_pending(domain):
                logger.debug("Flushing updates for domain='%s' ...", domain)
                instances.update_data(domain)

            logger.debug("Invoking cookies.clear(%s) ...", domain)
            cookies.clear(domain)

            logger.debug("EXIT!")
            return

        logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
        for instance in peerlist:
            logger.debug("instance='%s'", instance)
            if instance is None or instance == "":
                logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
                continue

            logger.debug("instance='%s' - BEFORE!", instance)
            instance = tidyup.domain(instance)
            logger.debug("instance='%s' - AFTER!", instance)

            if instance == "":
                logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
                continue
            elif ".." in instance:
                logger.warning("instance='%s' contains double-dot, removing ...", instance)
                instance = instance.replace("..", ".")

            # Convert international domain names to their ASCII (punycode) form
            logger.debug("instance='%s' - BEFORE!", instance)
            instance = instance.encode("idna").decode("utf-8")
            logger.debug("instance='%s' - AFTER!", instance)

            if not domain_helper.is_wanted(instance):
                logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
                continue
            elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
                logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
                continue
            elif instance.find("/tag/") > 0:
                logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
                continue
            elif not instances.is_registered(instance):
                # Flush pending data of the current domain before descending
                logger.debug("Checking if domain='%s' has pending updates ...", domain)
                if instances.has_pending(domain):
                    logger.debug("Flushing updates for domain='%s' ...", domain)
                    instances.update_data(domain)

                logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
                if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                    logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                    fetch_instances(instance, domain, None, command, path)
                else:
                    # Too deep or too few peers: only register, don't crawl
                    logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                    instances.add(instance, domain, command)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        logger.debug("Checking if domain='%s' has pending updates ...", domain)
        if instances.has_pending(domain):
            logger.debug("Flushing updates for domain='%s' ...", domain)
            instances.update_data(domain)
    finally:
        # Lower the depth counter again, whatever happened above
        _DEPTH = _DEPTH - 1

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Return the list of peers of ``domain``.

    For "misskey", "lemmy" and "peertube" the software-specific
    ``fetch_peers()`` of the matching network module is invoked. For any
    other software the generic API paths are queried one after another until
    one of them returns a non-empty JSON response.

    Args:
        domain: Domain to query, checked by domain_helper.raise_on().
        software: Software name of ``domain`` or None.
        origin: Origin domain or None; must not be an empty string.

    Returns:
        The peers found, or an empty list when CSRF checking failed, no path
        delivered data, or the response was not a list.

    Raises:
        ValueError: On ``software``/``origin`` of the wrong type or an empty
            ``origin``.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = dict()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # API paths to try, in order.
    # NOTE(review): only the first entry is visible in the damaged source;
    # the second one is restored from the upstream project - confirm.
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)

            # First path with data wins
            break

    if not isinstance(peers, list):
        # A non-list JSON payload cannot be processed by callers
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of ``domain`` from the meta tags of an HTML page.

    The page at ``path`` is fetched and parsed. The first of the meta tags
    ``og:platform``, ``generator``, ``application-name`` and ``og:site_name``
    that carries a string ``content`` provides the software name, and the
    matching detection mode is stored for the domain. Afterwards version
    numbers and phrases such as "powered by", "hosted on", " by " and " see "
    are stripped from the name.

    Args:
        domain: Domain to fetch from, checked by domain_helper.raise_on().
        path: Path of the page to fetch, must be a non-empty string.

    Returns:
        The cleaned-up software name, or None when nothing was detected.

    Raises:
        ValueError: When ``path`` is not a string or is empty.
        requests.exceptions.TooManyRedirects: When the response URL does not
            belong to ``domain`` (redirect to another domain).
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    # NOTE(review): only the timeout tuple is visible in the damaged source;
    # the headers argument and allow_redirects=True are restored from the
    # upstream project - confirm against network.fetch_response().
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # "in" instead of find() > 0: a document that starts directly with
    # "<html" (index 0) is valid HTML and must be parsed, too.
    if ((response.ok and response.status_code < 300) or response.status_code == 410) and "<html" in response.text and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        platform = doc.find("meta", {"property": "og:platform"})
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        app_name = doc.find("meta", {"name" : "application-name"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s',app_name[]='%s'", type(generator), type(site_name), type(platform), type(app_name))
        # Only the first tag with a string content is used, in this order
        if isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
        elif isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(app_name, bs4.element.Tag) and isinstance(app_name.get("content"), str):
            logger.debug("Found property=og:app_name, domain='%s'", domain)
            software = tidyup.domain(app_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has application-name='%s' - Setting detection_mode=app_name ...", domain, software)
                instances.set_detection_mode(domain, "APP_NAME")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            # The redirect target is a new instance, crawl it as well
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        # Reset everything detected so far for the redirecting domain
        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(software_helper.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = software_helper.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = software_helper.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software running on ``domain``.

    Nodeinfo is fetched first. When it yields a JSON document the software
    name is read from ``[software][name]`` or ``[server][software]``; in all
    other cases fetch_generator_from_path() is used as fallback. The name is
    then passed through software_helper.alias() and stripped of version
    numbers and a "powered by" phrase.

    Args:
        domain: Domain to inspect, checked by domain_helper.raise_on().
        path: Optional nodeinfo path handed to nodeinfo.fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be detected.

    Raises:
        ValueError: When ``path`` is neither a string nor None.
        Exception: Whatever exception nodeinfo.fetch_nodeinfo() delivered
            under the "exception" key is raised again.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = nodeinfo.fetch_nodeinfo(domain, path)

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        # From here on "data" is the decoded nodeinfo document itself
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]
    else:
        logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    # NOTE(review): when no JSON was found above, "data" is still the raw
    # result and the chain below typically ends in its last branch, which
    # fetches the generator a second time - confirm this is intended.
    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "server" in data and "software" in data["server"]:
        logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
        software = data["server"]["software"].lower()
        logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # isinstance() guard: a None from alias() must not reach the
        # substring tests, which would raise TypeError.
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    Every ``<tr>`` with at least one ``<td>`` is read: the first cell is the
    domain, the second cell the reason. Unwanted and invalid domains are
    skipped.

    Args:
        tag: The table element to scan.

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: When ``tag`` is not a bs4.element.Tag.
        KeyError: When ``tag`` contains no table rows.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows only once, they are needed for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if not cells:
            # Header rows (<th> only) carry no data
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        # NOTE(review): a row with a single <td> raises IndexError here, as
        # in the original code - confirm such rows cannot occur.
        reason = tidyup.reason(cells[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # One known row lists three domains in a single cell; split it
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect all wanted peers from the "linked", "allowed" and "blocked" lists.

    Each entry may be a plain string or a dict with a "domain" key. The
    domain is tidied up with tidyup.domain() and appended unless
    domain_helper.is_wanted() rejects it.

    Args:
        rows: Dict that may contain the keys "linked", "allowed" and
            "blocked"; missing keys and None values are skipped.

    Returns:
        The list of tidied-up peer domains.

    Raises:
        ValueError: When ``rows`` is not a dict or an entry is neither a
            string, a dict with a "domain" key, None nor empty.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not domain_helper.is_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers