# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging

from urllib.parse import urlparse

import bs4
import requests
import validators

from fba import csrf

from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import domain as domain_helper
from fba.helpers import software as software_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network
from fba.http import nodeinfo

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
# Depth counter, being raised and lowered by fetch_instances() while it
# recurses into newly discovered peers.
# NOTE(review): the assignment itself was missing from the listing; the
# initial value 0 is inferred from the "%d" formatting and the
# "_DEPTH <= max_crawl_depth" comparison in fetch_instances() - confirm.
_DEPTH = 0

# Module-wide logger; all functions below log through it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Register ``domain``, fetch its peers and crawl peers that are not registered yet.

    The module-level ``_DEPTH`` counter is raised for the duration of the
    call and lowered again when the call ends, whether it returns normally
    or raises.

    Args:
        domain: Domain to process; checked with ``domain_helper.raise_on()``.
        origin: Domain this one was found on. May be ``None`` unless
            ``command`` is one of the commands that require an origin.
        software: Software name, or ``None`` to have it determined with
            ``determine_software()`` (skipped when nodeinfo was fetched
            recently).
        command: Name of the invoking command, handed to ``instances.add()``.
        path: Optional path handed to ``determine_software()`` and to
            recursive invocations.

    Raises:
        ValueError: If a parameter has the wrong type, ``command`` is empty,
            ``origin`` is ``None`` for a command requiring it, or ``domain``
            has recently been fetched.
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)

    # NOTE(review): the statements raising/lowering the counter were missing
    # from the listing this block was restored from; their presence is
    # inferred from the "_DEPTH" log arguments and the depth comparison
    # below - confirm the exact placement.
    _DEPTH = _DEPTH + 1
    try:
        _crawl_instance(domain, origin, software, command, path)
    finally:
        # Lower the counter even when validation or a nested crawl raises,
        # otherwise every failure would permanently reduce the crawl depth
        # that remains available to later invocations.
        _DEPTH = _DEPTH - 1


def _crawl_instance(domain: str, origin: str, software: str, command: str, path: str = None):
    """Validate parameters, register ``domain`` and walk through its peer list."""
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
        raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
    elif instances.is_recent(domain, "last_instance_fetch"):
        raise ValueError(f"domain='{domain}' has recently been fetched but function was invoked")
    elif software is None and not instances.is_recent(domain, "last_nodeinfo"):
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Detection failures are recorded, the crawl continues with
            # software=None.
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif software is None:
        logger.debug("domain='%s' has unknown software or nodeinfo has recently being fetched", domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")

    logger.debug("Checking if domain='%s' is registered ...", domain)
    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

        logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
        instances.set_last_instance_fetch(domain)

    # NOTE(review): initialisation restored; peerlist must be bound before
    # the type() logging below when software is None or fetching fails.
    peerlist = list()
    logger.debug("software='%s'", software)
    if software is not None:
        try:
            logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
            peerlist = fetch_peers(domain, software, origin)
        except network.exceptions as exception:
            logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None or len(peerlist) == 0:
        logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)

        if instances.has_pending(domain):
            logger.debug("Flushing updates for domain='%s' ...", domain)
            instances.update_data(domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        logger.debug("EXIT!")
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)
        if instance is None or instance == "":
            logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
            continue

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        if not instance:
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue

        # Convert internationalised names to their ASCII (punycode) form.
        logger.debug("instance='%s' - BEFORE!", instance)
        instance = instance.encode("idna").decode("utf-8")
        logger.debug("instance='%s' - AFTER!", instance)

        if not domain_helper.is_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            logger.debug("Checking if domain='%s' has pending updates ...", domain)
            if instances.has_pending(domain):
                logger.debug("Flushing updates for domain='%s' ...", domain)
                instances.update_data(domain)

            logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
            if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                # Still within the configured depth: crawl the new peer now.
                logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                fetch_instances(instance, domain, None, command, path)
            else:
                # Too deep (or too few peers): only register the peer.
                logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)

    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Fetch the peer list of ``domain``.

    For ``misskey``, ``lemmy`` and ``peertube`` the work is delegated to the
    matching ``fba.networks`` module. For any other software the generic API
    paths are queried one after another until one of them yields a
    non-empty JSON payload.

    Args:
        domain: Domain to query; checked with ``domain_helper.raise_on()``.
        software: Software name or ``None``.
        origin: Origin domain or ``None``; only handed to
            ``lemmy.fetch_peers()``.

    Returns:
        The peers found by the generic lookup (an empty list when CSRF
        determination fails or no path delivers a list), or whatever the
        delegated network module returns.

    Raises:
        ValueError: If ``software`` or ``origin`` has the wrong type or
            ``origin`` is an empty string.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    # NOTE(review): initial value restored; it is always overwritten by
    # csrf.determine() before use - confirm the original placeholder.
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # NOTE(review): only the first entry survived in the listing; the second
    # one was restored from the numbering gap - confirm against upstream.
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        # NOTE(review): the first three arguments were restored; only the
        # timeout tuple was present in the listing - confirm the order.
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            # Remember the failure and try the next path.
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)
            break

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Detect the software of ``domain`` from the HTML ``<meta>`` tags of ``path``.

    The tags are checked in this order and the first one carrying a string
    ``content`` attribute wins: ``og:platform``, ``generator``,
    ``application-name``, ``og:site_name``. The matching detection mode is
    stored through ``instances.set_detection_mode()``. The detected name is
    then cleaned from version numbers and phrases such as "powered by".

    Args:
        domain: Domain to fetch from; checked with
            ``domain_helper.raise_on()``.
        path: Path to request, ``"/"`` by default.

    Returns:
        The detected software name or ``None`` when nothing was found.

    Raises:
        ValueError: If ``path`` is not a string or is empty.
        requests.exceptions.TooManyRedirects: If the page was not parsed and
            the response URL does not belong to ``domain``.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    # NOTE(review): all arguments except the timeout tuple were restored
    # from the numbering gaps of the listing - confirm against upstream.
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # A containment test is used so that a document starting directly with
    # "<html" (offset 0) is parsed as well.
    if ((response.ok and response.status_code < 300) or response.status_code == 410) and "<html" in response.text and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        platform = doc.find("meta", {"property": "og:platform"})
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        app_name = doc.find("meta", {"name" : "application-name"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s',app_name[]='%s'", type(generator), type(site_name), type(platform), type(app_name))
        if isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
        elif isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(app_name, bs4.element.Tag) and isinstance(app_name.get("content"), str):
            logger.debug("Found name=application-name, domain='%s'", domain)
            software = tidyup.domain(app_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has application-name='%s' - Setting detection_mode=APP_NAME ...", domain, software)
                instances.set_detection_mode(domain, "APP_NAME")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            # Crawl the redirect target as a new instance found on `domain`.
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        # Reset everything detected so far for the redirecting domain.
        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(software_helper.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = software_helper.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = software_helper.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine the software running on ``domain``.

    Nodeinfo is fetched first. When it cannot be used (error, missing
    ``[software][name]``, message-only response) the name is looked up with
    ``fetch_generator_from_path()`` instead. The result is passed through
    ``software_helper.alias()`` and cleaned from version numbers and a
    "powered by" phrase.

    Args:
        domain: Domain to check; validated with
            ``domain_helper.raise_on()``.
        path: Optional nodeinfo path handed to ``nodeinfo.fetch_nodeinfo()``.

    Returns:
        The software name or ``None`` when it could not be determined.

    Raises:
        ValueError: If ``path`` is neither a string nor ``None``.
        Exception: Whatever exception ``nodeinfo.fetch_nodeinfo()`` reported
            under the ``exception`` key is raised again.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = nodeinfo.fetch_nodeinfo(domain, path)

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        # From here on only the decoded nodeinfo document is inspected.
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]
    else:
        logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    # NOTE(review): the checks below also run when the generator fallback
    # above was already used; such a response will usually reach the last
    # branch and fetch the generator a second time - confirm whether that
    # is intended.
    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "server" in data and "software" in data["server"]:
        logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
        software = data["server"]["software"].lower()
        logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # The isinstance() guard keeps a non-string result of alias() from
        # raising TypeError in the containment tests.
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    The first ``<td>`` of every row is taken as the domain and the second
    one as the reason. Rows without ``<td>`` cells, unwanted domains and
    invalid domain names are skipped.

    Args:
        tag: Table element whose ``<tr>`` rows are examined.

    Returns:
        A list of dicts with the keys ``domain`` and ``reason``.

    Raises:
        ValueError: If ``tag`` is not a ``bs4.element.Tag``.
        KeyError: If ``tag`` contains no ``<tr>`` element.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once; they are needed for the check and the loop.
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if len(cells) == 0:
            logger.debug("Skipping element, no <td> found")
            continue
        elif len(cells) < 2:
            # Without a second cell there is no reason column to read.
            logger.warning("Skipping element, only %d <td> found but 2 are needed", len(cells))
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # One known row lists several domains at once; split it by hand.
            # NOTE(review): only the develop.gab.com entry survived in the
            # listing; the gab.com and gab.ai entries were restored from the
            # row text compared above - confirm.
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect the wanted peers from the ``linked``, ``allowed`` and ``blocked`` lists.

    Each entry may be a plain string or a dict carrying a ``domain`` key.
    Entries are tidied with ``tidyup.domain()`` and dropped when
    ``domain_helper.is_wanted()`` rejects them.

    Args:
        rows: Dict that may contain the keys ``linked``, ``allowed`` and
            ``blocked``; missing keys and ``None`` values are skipped.

    Returns:
        A list with all wanted peer domains, in the order they were found.

    Raises:
        ValueError: If ``rows`` is not a dict or an entry is neither a
            string nor a dict with a ``domain`` key.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not domain_helper.is_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers