# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging

from urllib.parse import urlparse

import bs4
import requests
import validators

from fba import csrf

from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import domain as domain_helper
from fba.helpers import software as software_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network
from fba.http import nodeinfo

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube

# Recursion depth counter, raised on entry of fetch_instances() and lowered
# again when it leaves; compared against config "max_crawl_depth" there.
_DEPTH = 0

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """
    Fetches the peer list of a domain and registers every peer not yet known.

    Unknown peers are crawled recursively while the module-level depth counter
    does not exceed config "max_crawl_depth" and the peer list has at least
    config "min_peers_length" entries; otherwise they are only added.

    Parameters:
        domain:   domain to crawl, checked by domain_helper.raise_on()
        origin:   domain that led to this one; may be None except for the
                  commands listed in the validation below
        software: software name, or None to have it determined through
                  determine_software()
        command:  non-empty name of the invoking command
        path:     optional path, passed on to determine_software() and
                  instances.add()

    Raises:
        ValueError: if a parameter has the wrong type, 'command' is empty or
                    'origin' is None for a command that requires it.
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)

    # The counter is lowered in the finally block so that an exception raised
    # by validation or by a nested crawl cannot leave it permanently raised.
    _DEPTH = _DEPTH + 1
    try:
        domain_helper.raise_on(domain)

        if not isinstance(origin, str) and origin is not None:
            raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
        elif not isinstance(command, str):
            raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
        elif command == "":
            raise ValueError("Parameter 'command' is empty")
        elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
            # Exceptions do not apply %-formatting to their arguments
            raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
        elif software is None:
            try:
                logger.debug("Software for domain='%s' is not set, determining ...", domain)
                software = determine_software(domain, path)
            except network.exceptions as exception:
                # Best effort: software stays None and no peers are fetched below
                logger.warning("Exception '%s' during determining software type", type(exception))
                instances.set_last_error(domain, exception)

            logger.debug("Determined software='%s' for domain='%s'", software, domain)
        elif not isinstance(software, str):
            raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
        elif not isinstance(path, str) and path is not None:
            raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

        logger.debug("Checking if domain='%s' is registered ...", domain)
        if not instances.is_registered(domain):
            logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
            instances.add(domain, origin, command, path, software)

            logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
            instances.set_last_instance_fetch(domain)

        peerlist = list()
        logger.debug("software='%s'", software)
        if software is not None:
            try:
                logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
                peerlist = fetch_peers(domain, software, origin)
            except network.exceptions as exception:
                logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

        logger.debug("peerlist[]='%s'", type(peerlist))
        if isinstance(peerlist, list):
            logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peerlist))
            instances.set_total_peers(domain, peerlist)

        logger.debug("peerlist[]='%s'", type(peerlist))
        if peerlist is None or len(peerlist) == 0:
            logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)

            if instances.has_pending(domain):
                logger.debug("Flushing updates for domain='%s' ...", domain)
                instances.update_data(domain)

            logger.debug("Invoking cookies.clear(%s) ...", domain)
            cookies.clear(domain)

            logger.debug("EXIT!")
            return

        logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
        for instance in peerlist:
            logger.debug("instance='%s'", instance)
            if instance is None or instance == "":
                logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
                continue

            logger.debug("instance='%s' - BEFORE!", instance)
            instance = tidyup.domain(instance)
            logger.debug("instance='%s' - AFTER!", instance)

            if instance == "":
                logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
                continue

            logger.debug("instance='%s' - BEFORE!", instance)
            try:
                instance = instance.encode("idna").decode("utf-8")
            except UnicodeError as exception:
                # One malformed peer name must not abort the whole crawl
                logger.warning("Cannot encode instance='%s' to IDNA: '%s' - SKIPPED!", instance, type(exception))
                continue
            logger.debug("instance='%s' - AFTER!", instance)

            if not domain_helper.is_wanted(instance):
                logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
                continue
            elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
                logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
                continue
            elif instance.find("/tag/") > 0:
                logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
                continue
            elif not instances.is_registered(instance):
                # Flush this domain's pending data before a nested crawl starts
                logger.debug("Checking if domain='%s' has pending updates ...", domain)
                if instances.has_pending(domain):
                    logger.debug("Flushing updates for domain='%s' ...", domain)
                    instances.update_data(domain)

                logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
                if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                    logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                    fetch_instances(instance, domain, None, command, path)
                else:
                    logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                    instances.add(instance, domain, command)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        logger.debug("Checking if domain='%s' has pending updates ...", domain)
        if instances.has_pending(domain):
            logger.debug("Flushing updates for domain='%s' ...", domain)
            instances.update_data(domain)

        logger.debug("EXIT!")
    finally:
        _DEPTH = _DEPTH - 1

def fetch_peers(domain: str, software: str, origin: str) -> list:
    """
    Returns the list of peers of a domain.

    For "misskey", "lemmy" and "peertube" the request is delegated to the
    matching fba.networks module. For everything else the generic API paths
    are tried in order and the first non-empty JSON response is used.

    Parameters:
        domain:   domain to query, checked by domain_helper.raise_on()
        software: software name or None
        origin:   originating domain or None; only passed on to
                  lemmy.fetch_peers()

    Returns:
        The fetched peers; an empty list when the CSRF check failed, no path
        delivered data or the response was not a list.

    Raises:
        ValueError: if 'software' or 'origin' has the wrong type or 'origin'
                    is an empty string.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # Generic API paths, tried in this order
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)

            # First path that delivered data wins
            break

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers

def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """
    Detects the software of a domain from the meta tags of an HTML page.

    The page at 'path' is fetched and the first usable of these meta tags
    is taken: name="generator", property="og:platform",
    property="og:site_name". The matching detection mode (GENERATOR,
    PLATFORM, SITE_NAME) is stored through instances.set_detection_mode().
    Version numbers and phrases such as "powered by", " hosted on ", " by "
    and " see " are stripped from the value afterwards.

    Parameters:
        domain: domain to query, checked by domain_helper.raise_on()
        path:   non-empty path of the page to fetch

    Returns:
        The detected software name, or None if nothing usable was found.

    Raises:
        ValueError: if 'path' is not a string or is empty.
        requests.exceptions.TooManyRedirects: if the response URL does not
            belong to 'domain' (redirect to another domain).
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    # Containment test: a document may start directly with "<html" (index 0)
    if response.ok and response.status_code < 300 and "<html" in response.text and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name" : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        platform = doc.find("meta", {"property": "og:platform"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s'", type(generator), type(site_name), type(platform))
        # Only the first tag with string content is used, even if its
        # tidied value turns out to be empty
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        # Reset all detection data of the redirecting domain before raising
        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(software_helper.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = software_helper.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = software_helper.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software

def determine_software(domain: str, path: str = None) -> str:
    """
    Determines the software a domain is running.

    Nodeinfo is fetched first; if it cannot be used (error, missing
    software name, error message in the JSON) the result of
    fetch_generator_from_path() is used instead. The name is then passed
    through software_helper.alias() and stripped of version numbers and a
    "powered by" phrase.

    Parameters:
        domain: domain to query, checked by domain_helper.raise_on()
        path:   optional nodeinfo path, passed on to nodeinfo.fetch_nodeinfo()

    Returns:
        The software name, or None if it could not be determined.

    Raises:
        ValueError: if 'path' is neither a string nor None.
        Exception:  whatever nodeinfo.fetch_nodeinfo() returned under the
                    key "exception" is raised again.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = nodeinfo.fetch_nodeinfo(domain, path)

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]

        # A payload that is not a JSON object cannot carry software
        # information; an empty dict routes it to the generator fallback
        if not isinstance(data, dict):
            data = dict()
    else:
        logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "server" in data and "software" in data["server"]:
        logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
        software = data["server"]["software"].lower()
        logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # isinstance() guard: the 'in' tests raise TypeError on non-strings
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software

def find_domains(tag: bs4.element.Tag) -> list:
    """
    Extracts domain/reason pairs from the rows of an HTML table.

    The first <td> of each row is taken as the domain, the second one as
    the reason. Rows without <td>, unwanted domains and invalid domains
    are skipped.

    Parameters:
        tag: element containing the <tr> rows

    Returns:
        A list of dicts with the keys "domain" and "reason".

    Raises:
        ValueError: if 'tag' is not a bs4.element.Tag.
        KeyError:   if 'tag' contains no <tr> element.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once instead of once for the check and once for the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if len(cells) == 0:
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)

        # A row with a single cell has no reason column; record None rather
        # than failing with IndexError
        reason = tidyup.reason(cells[1].text) if len(cells) > 1 else None

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Known row that lists three domains in a single cell
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains

def add_peers(rows: dict) -> list:
    """
    Collects all wanted peers from the lists "linked", "allowed" and "blocked".

    A peer may be a plain string or a dict with the key "domain"; each one is
    tidied with tidyup.domain() and kept only if domain_helper.is_wanted()
    accepts it.

    Parameters:
        rows: dict that may contain the keys "linked", "allowed", "blocked"

    Returns:
        A list of tidied peer domains.

    Raises:
        ValueError: if 'rows' is not a dict or a peer is neither a string
                    nor a dict with the key "domain".
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not domain_helper.is_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers