1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
34 from fba.http import nodeinfo
36 from fba.models import instances
38 from fba.networks import lemmy
39 from fba.networks import misskey
40 from fba.networks import peertube
44 logging.basicConfig(level=logging.INFO)
# The call above configures the root logger at INFO level; the logger created
# below is named after this module and is the one the functions visible in
# this file log through.
45 logger = logging.getLogger(__name__)
47 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
# Purpose: make sure `domain` is registered, fetch its peer list and walk over
# every peer, calling fetch_instances() again for it or adding it directly via
# instances.add().
#
# Parameters:
#   domain   - checked by domain_helper.raise_on()
#   origin   - str or None
#   software - str or None; None triggers detection via determine_software()
#   command  - must be a str; an "is empty" ValueError also exists (see below)
#   path     - str or None; forwarded to determine_software()/instances.add()
# Raises:
#   ValueError - for a parameter of the wrong type or an empty `command`
#
# NOTE(review): this listing prefixes each line with the original file's line
# number and skips some of them (e.g. 57, 60, 84, 148), so statements such as
# `try:`, `else:`, `return` or `continue` are presumably missing from view --
# confirm control flow against the complete file.
# NOTE(review): `_DEPTH` is read here but not defined in the visible lines;
# presumably a module-level recursion counter -- confirm.
48 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)
49 domain_helper.raise_on(domain)
# Parameter validation: `origin` and `path` may be None, `command` may not.
50 if not isinstance(origin, str) and origin is not None:
51 raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
52 elif not isinstance(command, str):
53 raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
# NOTE(review): the condition guarding the next raise (original line 57) is
# not visible; presumably an empty-string check on `command`.
54 raise ValueError("Parameter 'command' is empty")
55 elif software is None:
# No software given: try to detect it. A network exception is logged and
# stored as the domain's last error.
56 logger.debug("Software for domain='%s' is not set, determining ...", domain)
57 software = determine_software(domain, path)
58 except network.exceptions as exception:
59 logger.warning("Exception '%s' during determining software type", type(exception))
60 instances.set_last_error(domain, exception)
61 logger.debug("Determined software='%s' for domain='%s'", software, domain)
62 elif not isinstance(software, str):
63 raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
64 elif not isinstance(path, str) and path is not None:
65 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
# Register the domain on first sight, then stamp the time of this fetch.
66 logger.debug("Checking if domain='%s' is registered ...", domain)
67 if not instances.is_registered(domain):
68 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
69 instances.add(domain, origin, command, path, software)
70 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
71 instances.set_last_instance_fetch(domain)
# Peers are only fetched when a software type is known.
72 logger.debug("software='%s'", software)
73 if software is not None:
74 logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
75 peerlist = fetch_peers(domain, software, origin)
76 except network.exceptions as exception:
77 logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))
# Only a real list is passed on to instances.set_total_peers().
# NOTE(review): the debug text below says "set_total_peerlist" while the call
# is set_total_peers() -- the log message looks stale.
78 logger.debug("peerlist[]='%s'", type(peerlist))
79 if isinstance(peerlist, list):
80 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
81 instances.set_total_peers(domain, peerlist)
# Nothing to crawl: flush pending updates and clear cookies for the domain.
# NOTE(review): presumably followed by an early return (not visible).
82 logger.debug("peerlist[]='%s'", type(peerlist))
83 if peerlist is None or len(peerlist) == 0:
84 logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)
85 if instances.has_pending(domain):
86 logger.debug("Flushing updates for domain='%s' ...", domain)
87 instances.update_data(domain)
88 logger.debug("Invoking cookies.clear(%s) ...", domain)
89 cookies.clear(domain)
90 logger.debug("EXIT!")
# Walk over every peer reported by `domain`.
91 logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
92 for instance in peerlist:
93 logger.debug("instance='%s'", instance)
94 if instance is None or instance == "":
95 logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
# Clean up the peer's name with tidyup.domain() before any further checks.
96 logger.debug("instance='%s' - BEFORE!", instance)
97 instance = tidyup.domain(instance)
98 logger.debug("instance='%s' - AFTER!", instance)
# NOTE(review): the condition for this warning (original line 121) is not
# visible; the message indicates it fires when tidyup left an empty value.
99 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
# Re-encode the name through Python's "idna" codec so it is in ASCII form.
100 logger.debug("instance='%s' - BEFORE!", instance)
101 instance = instance.encode("idna").decode("utf-8")
102 logger.debug("instance='%s' - AFTER!", instance)
# Filter chain: unwanted names, profile/user links and tag links are skipped.
# `find(...) > 0` means a match at index 0 does not count. The "/c/" test only
# applies when the part before the first "/" is already registered.
103 if not domain_helper.is_wanted(instance):
104 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
105 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
106 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
107 elif instance.find("/tag/") > 0:
108 logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
109 elif not instances.is_registered(instance):
# Unknown peer: flush pending updates of the current domain first.
110 logger.debug("Checking if domain='%s' has pending updates ...", domain)
111 if instances.has_pending(domain):
112 logger.debug("Flushing updates for domain='%s' ...", domain)
113 instances.update_data(domain)
# Recurse only while the crawl depth is within "max_crawl_depth" and the peer
# list has at least "min_peers_length" entries (both read from config).
# NOTE(review): the instances.add() call further down is presumably the
# `else:` branch of this test (original line 148 is not visible).
114 logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
115 if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
116 logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
117 fetch_instances(instance, domain, None, command, path)
118 logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
119 instances.add(instance, domain, command)
# Final clean-up: clear cookies and flush whatever is still pending.
# NOTE(review): presumably runs after the loop; indentation is lost here.
120 logger.debug("Invoking cookies.clear(%s) ...", domain)
121 cookies.clear(domain)
122 logger.debug("Checking if domain='%s' has pending updates ...", domain)
123 if instances.has_pending(domain):
124 logger.debug("Flushing updates for domain='%s' ...", domain)
125 instances.update_data(domain)
126 logger.debug("EXIT!")
127 def fetch_peers(domain: str, software: str, origin: str) -> list:
# Purpose: obtain the peer list of `domain`. misskey, lemmy and peertube are
# delegated to their own modules; any other software is queried through API
# paths, of which "/api/v1/instance/peers" is the only one visible here.
#
# Parameters:
#   domain   - checked by domain_helper.raise_on()
#   software - str or None; selects the network-specific fetcher
#   origin   - str or None, but not an empty string; only passed on to lemmy
# Returns: annotated as `list`.
# Raises:
#   ValueError - for a wrongly typed `software`/`origin` or an empty `origin`
#
# NOTE(review): this listing skips several original lines (e.g. 186, 195-201,
# 203-206, 209-211, 221-222, 235), so `try:`, the `paths`/`peers` setup, the
# loop header, the get_json_api() arguments and the `return` statements are
# not visible -- confirm against the complete file.
128 logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
129 domain_helper.raise_on(domain)
130 if not isinstance(software, str) and software is not None:
131 raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
132 elif not isinstance(origin, str) and origin is not None:
133 raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
134 elif isinstance(origin, str) and origin == "":
135 raise ValueError("Parameter 'origin' is empty")
# Software-specific shortcuts; each returns the delegate's result directly.
136 if software == "misskey":
137 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
138 return misskey.fetch_peers(domain)
139 elif software == "lemmy":
140 logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
141 return lemmy.fetch_peers(domain, origin)
142 elif software == "peertube":
143 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
144 return peertube.fetch_peers(domain)
# Generic path: determine CSRF headers first. A network exception is logged
# and stored as last error; the log text says an empty list is returned then.
# NOTE(review): `csrf` is not among the imports visible in this listing.
145 # No CSRF by default, you don't have to add network.api_headers by yourself here
146 logger.debug("Checking CSRF for domain='%s'", domain)
147 headers = csrf.determine(domain, dict())
148 except network.exceptions as exception:
149 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
150 instances.set_last_error(domain, exception)
151 logger.debug("Returning empty list ... - EXIT!")
152 "/api/v1/instance/peers",
153 # Init peers variable
# Each path is requested via network.get_json_api() using the connection and
# read timeouts from config.
154 logger.debug("Checking %d paths ...", len(paths))
155 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
156 data = network.get_json_api(
157 (config.get("connection_timeout"), config.get("read_timeout"))
# A response carrying "error_message" is stored as last error; a non-empty
# "json" payload marks the domain as successfully handled.
158 logger.debug("data[]='%s'", type(data))
159 if "error_message" in data:
160 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
161 instances.set_last_error(domain, data)
162 elif "json" in data and len(data["json"]) > 0:
163 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
164 logger.debug("Marking domain='%s' as successfully handled ...", domain)
165 instances.set_success(domain)
# Guard against an API answering with something other than a list.
# NOTE(review): what happens to `peers` after the warning is not visible.
166 if not isinstance(peers, list):
167 logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
168 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
169 instances.set_total_peers(domain, peers)
170 logger.debug("peers()=%d - EXIT!", len(peers))
237 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
# Purpose: fetch `path` from `domain`, parse the returned HTML and derive a
# software name from its meta tags, then strip version/"powered by"-style
# decorations from that name.
#
# Parameters:
#   domain - checked by domain_helper.raise_on()
#   path   - must be a str (an "is empty" ValueError also exists); default "/"
# Returns: annotated as `str`; the visible code can also leave `software`
#   unset to a string (see the empty-string correction near the end).
# Raises:
#   ValueError - for a non-str or empty `path`
#   requests.exceptions.TooManyRedirects - when the response URL does not
#     belong to `domain`
#
# NOTE(review): this listing skips several original lines (e.g. 243, 247-248,
# 251-253, 255-257, 314, 334-335), so the initial value of `software`, the
# fetch_response() arguments and the `return` are not visible -- confirm
# against the complete file.
238 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
239 domain_helper.raise_on(domain)
241 if not isinstance(path, str):
242 raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
# NOTE(review): the condition guarding the next raise (original line 243) is
# not visible; presumably an empty-string check on `path`.
244 raise ValueError("Parameter 'path' is empty")
# NOTE(review): this repeats the "CALLED!" message already logged above.
246 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
249 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
250 response = network.fetch_response(
254 (config.get("connection_timeout"), config.get("read_timeout")),
# Parse only a successful (<300) response that contains "<html" and whose URL
# still belongs to `domain`. `find("<html") > 0` does not accept a match at
# index 0.
258 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
259 if response.ok and response.status_code < 300 and response.text.find("<html") > 0 and domain_helper.is_in_url(domain, response.url):
260 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
261 doc = bs4.BeautifulSoup(response.text, "html.parser")
263 logger.debug("doc[]='%s'", type(doc))
264 generator = doc.find("meta", {"name" : "generator"})
265 site_name = doc.find("meta", {"property": "og:site_name"})
266 platform = doc.find("meta", {"property": "og:platform"})
# Precedence of the meta tags: generator, then og:platform, then og:site_name.
# Each hit is passed through tidyup.domain() and, when non-empty, the matching
# detection mode is stored for the domain.
268 logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s'", type(generator), type(site_name), type(platform))
269 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
270 logger.debug("Found generator meta tag: domain='%s'", domain)
271 software = tidyup.domain(generator.get("content"))
273 logger.debug("software[%s]='%s'", type(software), software)
274 if software is not None and software != "":
275 logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
276 instances.set_detection_mode(domain, "GENERATOR")
277 elif isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
278 logger.debug("Found property=og:platform, domain='%s'", domain)
279 software = tidyup.domain(platform.get("content"))
281 logger.debug("software[%s]='%s'", type(software), software)
282 if software is not None and software != "":
283 logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
284 instances.set_detection_mode(domain, "PLATFORM")
285 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
286 logger.debug("Found property=og:site_name, domain='%s'", domain)
287 software = tidyup.domain(site_name.get("content"))
289 logger.debug("software[%s]='%s'", type(software), software)
290 if software is not None and software != "":
291 logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
292 instances.set_detection_mode(domain, "SITE_NAME")
# Redirect to a foreign host: crawl the new host if it is unknown, record the
# redirect as last error, reset software/detection mode/nodeinfo URL of the
# original domain and raise.
# NOTE(review): presumably the `elif` of the response check above --
# indentation is lost in this listing.
293 elif not domain_helper.is_in_url(domain, response.url):
294 logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)
296 components = urlparse(response.url)
298 logger.debug("components[]='%s'", type(components))
299 if not instances.is_registered(components.netloc):
300 logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
301 fetch_instances(components.netloc, domain, None, "fetch_generator")
303 message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
304 instances.set_last_error(domain, message)
305 instances.set_software(domain, None)
306 instances.set_detection_mode(domain, None)
307 instances.set_nodeinfo_url(domain, None)
309 raise requests.exceptions.TooManyRedirects(message)
# Post-processing of the detected name.
# NOTE(review): the statement that performs the "empty string to None"
# correction (original line 314) is not visible.
311 logger.debug("software[]='%s'", type(software))
312 if isinstance(software, str) and software == "":
313 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
315 elif isinstance(software, str) and ("." in software or " " in software):
316 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
317 software = version.remove(software)
# Strip well-known phrases; the first matching branch wins, so "powered by "
# is tested before the more general " by ".
319 logger.debug("software[]='%s'", type(software))
320 if isinstance(software, str) and "powered by " in software:
321 logger.debug("software='%s' has 'powered by' in it", software)
322 software = version.remove(version.strip_powered_by(software))
323 elif isinstance(software, str) and " hosted on " in software:
324 logger.debug("software='%s' has 'hosted on' in it", software)
325 software = version.remove(version.strip_hosted_on(software))
326 elif isinstance(software, str) and " by " in software:
327 logger.debug("software='%s' has ' by ' in it", software)
328 software = version.strip_until(software, " by ")
329 elif isinstance(software, str) and " see " in software:
330 logger.debug("software='%s' has ' see ' in it", software)
331 software = version.strip_until(software, " see ")
333 logger.debug("software='%s' - EXIT!", software)
336 def determine_software(domain: str, path: str = None) -> str:
# Purpose: work out which software `domain` runs. Nodeinfo is tried first;
# whenever it fails or lacks a software name, the generator lookup
# fetch_generator_from_path() is used instead. The result is then passed
# through software_helper.alias() and stripped of version/"powered by" parts.
#
# Parameters:
#   domain - checked by domain_helper.raise_on()
#   path   - str or None; forwarded to nodeinfo.fetch_nodeinfo()
# Returns: annotated as `str`; a log line below also announces a None return.
# Raises:
#   ValueError - for a non-str, non-None `path`
#   the exception stored under data["exception"], re-raised unchanged
#
# NOTE(review): this listing skips several original lines (e.g. 344-345, 358,
# 360-361, 365, 400, 402, 421), so the initial value of `software`, some
# `else:`/`if` guards, a possible reassignment of `data` and the `return`
# statements are not visible -- confirm against the complete file.
337 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
338 domain_helper.raise_on(domain)
340 if not isinstance(path, str) and path is not None:
341 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
343 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
346 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
347 data = nodeinfo.fetch_nodeinfo(domain, path)
# Three outcomes of the nodeinfo fetch: a stored exception is re-raised, an
# error message falls back to the generator lookup, and otherwise the JSON
# branch is taken (its header lines are not visible).
349 logger.debug("data[%s]='%s'", type(data), data)
350 if "exception" in data:
351 # Continue raising it
352 logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
353 raise data["exception"]
354 elif "error_message" in data:
355 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
356 software = fetch_generator_from_path(domain)
357 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
359 logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
# NOTE(review): the condition guarding this fallback (original line 361) is
# not visible.
362 logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
363 software = fetch_generator_from_path(domain)
364 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# Inspect the response document. Error/message-only answers and answers
# without [software][name] reset detection mode and nodeinfo URL before
# falling back to the generator lookup.
366 if "status" in data and data["status"] == "error" and "message" in data:
367 logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
368 instances.set_last_error(domain, data["message"])
369 instances.set_detection_mode(domain, None)
370 instances.set_nodeinfo_url(domain, None)
371 software = fetch_generator_from_path(domain)
372 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
373 elif "software" in data and "name" in data["software"]:
374 logger.debug("Found data[json][software][name] in JSON response")
375 software = data["software"]["name"]
376 logger.debug("software[%s]='%s' - FOUND!", type(software), software)
377 elif "message" in data:
378 logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
379 instances.set_last_error(domain, data["message"])
380 instances.set_detection_mode(domain, None)
381 instances.set_nodeinfo_url(domain, None)
383 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
384 software = fetch_generator_from_path(domain)
385 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
386 elif "server" in data and "software" in data["server"]:
387 logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
388 software = data["server"]["software"].lower()
389 logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
390 elif "software" not in data or "name" not in data["software"]:
391 logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
392 instances.set_detection_mode(domain, None)
393 instances.set_nodeinfo_url(domain, None)
395 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
396 software = fetch_generator_from_path(domain)
397 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# NOTE(review): the guard and `return` around the "Returning None" message
# (original lines 400/402) are not visible.
399 logger.debug("software[%s]='%s'", type(software), software)
401 logger.debug("Returning None - EXIT!")
# Map the raw name through software_helper.alias().
404 logger.debug("software='%s'- BEFORE!", software)
405 software = software_helper.alias(software)
406 logger.debug("software['%s']='%s' - AFTER!", type(software), software)
# NOTE(review): `str(None)` is "None", so if alias() can return None the
# `"." in software` test below would raise TypeError -- confirm that alias()
# always returns a str here.
408 if str(software) == "":
409 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
410 software = fetch_generator_from_path(domain)
411 elif len(str(software)) > 0 and ("." in software or " " in software):
412 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
413 software = version.remove(software)
415 logger.debug("software[]='%s'", type(software))
416 if isinstance(software, str) and "powered by" in software:
417 logger.debug("software='%s' has 'powered by' in it", software)
418 software = version.remove(version.strip_powered_by(software))
420 logger.debug("software='%s' - EXIT!", software)
423 def find_domains(tag: bs4.element.Tag) -> list:
# Purpose: walk over the <tr> rows of `tag` and collect a domain plus a
# reason from the first and second <td> of each row.
#
# Parameters:
#   tag - a bs4.element.Tag that must contain at least one <tr>
# Returns: annotated as `list`; the final log line reports len(domains).
# Raises:
#   ValueError - when `tag` is not a bs4.element.Tag
#   KeyError   - when `tag` contains no <tr> elements
#
# NOTE(review): this listing skips several original lines (e.g. 429, 435,
# 447-455, 457-459, 465-468, 471), so the `domains` initialisation, the
# `continue` statements, the appended entries and the `return` are not
# visible -- confirm against the complete file.
424 logger.debug("tag[]='%s' - CALLED!", type(tag))
425 if not isinstance(tag, bs4.element.Tag):
426 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
427 elif len(tag.select("tr")) == 0:
428 raise KeyError("No table rows found in table!")
431 for element in tag.select("tr"):
432 logger.debug("element[]='%s'", type(element))
# Rows without any <td> are skipped, going by the log message.
433 if not element.find("td"):
434 logger.debug("Skipping element, no <td> found")
# First cell is the domain, second cell is the reason; a row with a single
# <td> would make the [1] index fail.
# NOTE(review): `findAll` is the legacy BeautifulSoup spelling of `find_all`.
437 domain = tidyup.domain(element.find("td").text)
438 reason = tidyup.reason(element.findAll("td")[1].text)
440 logger.debug("domain='%s',reason='%s'", domain, reason)
442 if not domain_helper.is_wanted(domain):
443 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
# Special case for one literal cell value that holds several domains at once;
# only the "develop.gab.com" entry of its handling is visible here.
445 elif domain == "gab.com/.ai, develop.gab.com":
446 logger.debug("Multiple domains detected in one row")
456 "domain": "develop.gab.com",
# Validation looks only at the part before the first "/".
460 elif not validators.domain(domain.split("/")[0]):
461 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
464 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
470 logger.debug("domains()=%d - EXIT!", len(domains))
473 def add_peers(rows: dict) -> list:
# Purpose: go through the "linked", "allowed" and "blocked" entries of `rows`
# and collect their peers; each peer is normalised with tidyup.domain() and
# checked with domain_helper.is_wanted().
#
# Parameters:
#   rows - dict; keys that are absent or None are skipped
# Returns: annotated as `list`; the final log line reports len(peers).
# Raises:
#   ValueError - when `rows` is not a dict, or a peer has an unsupported type
#
# NOTE(review): this listing skips several original lines (e.g. 477, 483,
# 490, 497, 503, 506, 509), so the `peers` initialisation, the `continue`
# statements, the `else:` before the raise, the append and the `return` are
# not visible -- confirm against the complete file.
474 logger.debug("rows[]='%s' - CALLED!", type(rows))
475 if not isinstance(rows, dict):
476 raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")
479 for key in ["linked", "allowed", "blocked"]:
480 logger.debug("Checking key='%s'", key)
481 if key not in rows or rows[key] is None:
482 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
485 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
486 for peer in rows[key]:
# A peer may be a dict carrying a "domain" key or a plain string; None and ""
# are skipped.
487 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
488 if peer is None or peer == "":
489 logger.debug("peer is empty - SKIPPED")
491 elif isinstance(peer, dict) and "domain" in peer:
492 logger.debug("peer[domain]='%s'", peer["domain"])
493 peer = tidyup.domain(peer["domain"])
494 elif isinstance(peer, str):
495 logger.debug("peer='%s'", peer)
496 peer = tidyup.domain(peer)
# NOTE(review): presumably the `else:` branch of the chain above.
498 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
500 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
501 if not domain_helper.is_wanted(peer):
502 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
505 logger.debug("Appending peer='%s' ...", peer)
508 logger.debug("peers()=%d - EXIT!", len(peers))