1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
34 from fba.http import nodeinfo
36 from fba.models import instances
38 from fba.networks import lemmy
39 from fba.networks import misskey
40 from fba.networks import peertube
# Depth counter, being raised and lowered
#
# Module-wide recursion depth of fetch_instances(). That function reads it to
# decide whether an unregistered peer is crawled recursively or only added.
_DEPTH = 0

# NOTE(review): basicConfig() at import time configures the root logger for the
# whole process; kept unchanged because importers may rely on it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def _flush_pending(domain: str) -> None:
    """Call instances.update(domain) when instances.has_pending(domain) is true."""
    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update(domain)


def _clean_peer(instance, domain: str):
    """Tidy and IDNA-encode one raw peer entry of `domain`.

    Returns the cleaned name, or None when the entry has to be skipped.
    """
    if instance is None or instance == "":
        logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
        return None

    logger.debug("instance='%s' - BEFORE!", instance)
    instance = tidyup.domain(instance)
    logger.debug("instance='%s' - AFTER!", instance)

    if not instance:
        logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
        return None
    elif ".." in instance:
        logger.warning("instance='%s' contains double-dot, removing ...", instance)
        instance = instance.replace("..", ".")

    logger.debug("instance='%s' - BEFORE!", instance)
    try:
        instance = instance.encode("idna").decode("utf-8")
    except UnicodeError as exception:
        # The idna codec rejects e.g. empty or over-long labels. One malformed
        # peer must not abort the crawl of the remaining peers.
        logger.warning("instance='%s' cannot be encoded to IDNA: '%s' - SKIPPED!", instance, type(exception))
        return None
    logger.debug("instance='%s' - AFTER!", instance)

    return instance


def _crawl_peers(domain: str, origin: str, software: str, command: str, path: str) -> None:
    """Register `domain`, fetch its peer list and handle every wanted peer."""
    logger.debug("Checking if domain='%s' is registered ...", domain)
    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    # NOTE(review): indentation was lost in the source. This is placed at
    # function level so every crawl is recorded - confirm.
    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    # NOTE(review): the initial value is not visible in the source; an empty
    # list is assumed.
    peerlist = list()
    logger.debug("software='%s'", software)
    if software is not None:
        try:
            logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
            peerlist = fetch_peers(domain, software, origin)
        except network.exceptions as exception:
            logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    if peerlist is None or len(peerlist) == 0:
        logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)
        _flush_pending(domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
    for raw_peer in peerlist:
        logger.debug("instance='%s'", raw_peer)
        instance = _clean_peer(raw_peer, domain)
        if instance is None:
            continue

        if not domain_helper.is_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            # Flush this domain's own updates before descending into a peer
            _flush_pending(domain)

            logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
            if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                fetch_instances(instance, domain, None, command, path)
            else:
                logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    _flush_pending(domain)


def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Crawl `domain` for its peers and register the peers that are new.

    Unregistered peers are crawled recursively (with `domain` as their origin)
    while the module-level depth counter does not exceed the configured
    "max_crawl_depth" and the peer list has at least "min_peers_length"
    entries. Otherwise they are only added through instances.add().

    Parameters:
        domain: Domain to crawl; checked first by domain_helper.raise_on().
        origin: Domain this one was found on, or None. It must not be None
            for the commands listed in the check below.
        software: Software name of `domain`. None lets determine_software()
            detect it, unless nodeinfo has been fetched recently.
        command: Non-empty name of the invoking command.
        path: Optional path handed to determine_software() and
            instances.add().

    Raises:
        ValueError: On a wrongly typed or empty parameter, or when called
            recursively for a domain whose instances were fetched recently.
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
        raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
    elif _DEPTH > 0 and instances.is_recent(domain, "last_instance_fetch"):
        raise ValueError(f"domain='{domain}' has recently been fetched but function was invoked")
    elif software is None and not instances.is_recent(domain, "last_nodeinfo"):
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif software is None:
        logger.debug("domain='%s' has unknown software or nodeinfo has recently being fetched", domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")

    # Raise the depth counter for the crawl. The finally clause lowers it on
    # every exit path, including exceptions from nested crawls.
    _DEPTH = _DEPTH + 1
    try:
        _crawl_peers(domain, origin, software, command, path)
    finally:
        _DEPTH = _DEPTH - 1

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Return the peers of `domain`.

    "misskey", "lemmy" and "peertube" are delegated to their network modules.
    Any other software is queried through the generic JSON API path(s) below,
    using the headers returned by csrf.determine().

    Parameters:
        domain: Domain to query; checked first by domain_helper.raise_on().
        software: Software name of `domain`, or None.
        origin: Non-empty origin domain or None; only forwarded to
            lemmy.fetch_peers().

    Returns:
        The value of the delegated fetch_peers() call, or a list for the
        generic path. That list is empty when the CSRF check raised one of
        network.exceptions, no path delivered data, or the API answer was
        not a list.

    Raises:
        ValueError: On a wrongly typed `software`/`origin` or an empty
            `origin`.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, {})
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # NOTE(review): only this entry is visible in the source; the list may have
    # held further fallback paths - confirm against version control.
    paths = [
        "/api/v1/instance/peers",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        # NOTE(review): the first three arguments are not visible in the source
        # and are assumed to be (domain, path, headers) - confirm.
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)

            # The first path that delivers data wins
            break

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of `domain` from the HTML meta tags found at `path`.

    The meta tags are consulted in this order, and the first one present with
    a string content decides: og:platform, generator, application-name,
    og:site_name. The matching detection mode (PLATFORM, GENERATOR, APP_NAME,
    SITE_NAME) is stored through instances.set_detection_mode() when the
    tidied content is non-empty. Afterwards version numbers and phrases such
    as "powered by" are removed from the name.

    Parameters:
        domain: Domain to fetch from; checked first by domain_helper.raise_on().
        path: Non-empty path of the page to fetch.

    Returns:
        The detected software name, or None when nothing usable was found.

    Raises:
        ValueError: If `path` is not a string or is empty.
        requests.exceptions.TooManyRedirects: If the response URL does not
            belong to `domain`. The redirect is recorded as the domain's last
            error and its software, detection mode and nodeinfo URL are reset
            before raising.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    # NOTE(review): only the timeout tuple is visible in the source. The other
    # arguments, including network.web_headers and allow_redirects=True, are
    # assumed - confirm against version control.
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # "in" instead of find() > 0: a document that starts with "<html" (index 0)
    # is HTML as well.
    if ((response.ok and response.status_code < 300) or response.status_code == 410) and "<html" in response.text and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        platform = doc.find("meta", {"property": "og:platform"})
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        app_name = doc.find("meta", {"name" : "application-name"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s',app_name[]='%s'", type(generator), type(site_name), type(platform), type(app_name))
        if isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
        elif isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(app_name, bs4.element.Tag) and isinstance(app_name.get("content"), str):
            logger.debug("Found name=application-name, domain='%s'", domain)
            software = tidyup.domain(app_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has application-name='%s' - Setting detection_mode=APP_NAME ...", domain, software)
                instances.set_detection_mode(domain, "APP_NAME")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            # Make the redirect target known before giving up on `domain`
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(software_helper.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = software_helper.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = software_helper.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None, nodeinfo_url: str = None) -> str:
    """Determine the software name running on `domain`.

    The result of nodeinfo.fetch() is tried first. Whenever it carries no
    usable software name, fetch_generator_from_path() is used as the fallback.
    The name is then mapped through software_helper.alias() and stripped of
    version numbers and "powered by" phrases.

    Parameters:
        domain: Domain to inspect; checked first by domain_helper.raise_on().
        path: Optional path forwarded to nodeinfo.fetch().
        nodeinfo_url: Optional nodeinfo URL forwarded to nodeinfo.fetch().

    Returns:
        The software name, or None when it could not be detected.

    Raises:
        ValueError: If `path` or `nodeinfo_url` is neither a string nor None.
        Exception: Whatever nodeinfo.fetch() returned under the "exception"
            key is raised again.
    """
    logger.debug("domain='%s',path='%s',nodeinfo_url='%s' - CALLED!", domain, path, nodeinfo_url)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
    elif not isinstance(nodeinfo_url, str) and nodeinfo_url is not None:
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not of type 'str'")

    logger.debug("Fetching nodeinfo from domain='%s',path='%s',nodeinfo_url='%s' ...", domain, path, nodeinfo_url)
    data = nodeinfo.fetch(domain, path, nodeinfo_url)
    software = None

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        # NOTE(review): this condition and the unwrapping below are not visible
        # in the source; they are inferred from the log message - confirm.
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]
    else:
        logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "server" in data and "software" in data["server"]:
        logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
        software = data["server"]["software"].lower()
        logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # The isinstance() guard keeps the "in" tests from raising TypeError on
        # a non-string alias result.
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))

    # The generator fallback above may have returned None, so strip strings only
    if isinstance(software, str):
        software = software.strip()

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    Each <tr> below `tag` contributes its first <td> as the domain and its
    second <td> as the reason. Rows without <td>, unwanted domains and
    invalid domains are skipped.

    Parameters:
        tag: Element whose <tr> descendants are inspected.

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError: If `tag` contains no <tr> at all.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once; they are needed for the check and for the loop
    table_rows = tag.select("tr")
    if len(table_rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in table_rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if not cells:
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        # A row with a single cell has no reason column; avoid the IndexError
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            logger.debug("Multiple domains detected in one row")
            # NOTE(review): only the "develop.gab.com" entry is visible in the
            # source; "gab.com" and "gab.ai" are derived from the literal above
            # - confirm.
            for single_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": single_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect the wanted peers from the "linked", "allowed" and "blocked" lists.

    Every entry may be a plain string or a dict with a "domain" key. It is
    cleaned with tidyup.domain() and kept when domain_helper.is_wanted()
    accepts it. Missing keys, None lists and None or empty entries are skipped.

    Parameters:
        rows: Dict that may contain the three keys named above.

    Returns:
        A list of the cleaned peer names, in the order they were found.

    Raises:
        ValueError: If `rows` is not a dict, or an entry is neither a string
            nor a dict with a "domain" key.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not domain_helper.is_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers