1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
34 from fba.http import nodeinfo
36 from fba.models import instances
38 from fba.networks import lemmy
39 from fba.networks import misskey
40 from fba.networks import peertube
# Module-level logging setup, executed when this module is imported.
# NOTE(review): with the root level at INFO the many logger.debug() calls in
# this module emit nothing unless logging is reconfigured elsewhere.
44 logging.basicConfig(level=logging.INFO)
45 logger = logging.getLogger(__name__)
47 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
# Registers `domain` when it is not yet known, obtains its peer list through
# fetch_peers() and then walks every returned peer. Peers that are unwanted
# or that point at profiles/tags are skipped; unregistered peers are crawled
# recursively or handed to instances.add().
#
# Parameters:
#   domain   - checked by domain_helper.raise_on() before anything else
#   origin   - str or None; must not be None for the commands listed below
#   software - str or None; None triggers a lookup via determine_software()
#   command  - str, forwarded to instances.add() and to the recursive call
#   path     - str or None, forwarded to determine_software()
#
# Raises:
#   ValueError - when a parameter fails the validation chain below
#
# NOTE(review): `_DEPTH` is read throughout this function but is not assigned
# in the visible code; presumably a module-level recursion counter - confirm.
49 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)
51 domain_helper.raise_on(domain)
# Parameter validation. The software lookup is itself one branch of this
# if/elif chain, so the type checks on `software` and `path` further down
# only run when a software value was passed in.
53 if not isinstance(origin, str) and origin is not None:
54 raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
55 elif not isinstance(command, str):
56 raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
58 raise ValueError("Parameter 'command' is empty")
59 elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
60 raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
61 elif software is None:
63 logger.debug("Software for domain='%s' is not set, determining ...", domain)
64 software = determine_software(domain, path)
# A network failure during detection is logged and stored as the domain's
# last error instead of being propagated.
65 except network.exceptions as exception:
66 logger.warning("Exception '%s' during determining software type", type(exception))
67 instances.set_last_error(domain, exception)
69 logger.debug("Determined software='%s' for domain='%s'", software, domain)
70 elif not isinstance(software, str):
71 raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
72 elif not isinstance(path, str) and path is not None:
73 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
# Make sure the domain has a record before timestamps are written for it.
75 logger.debug("Checking if domain='%s' is registered ...", domain)
76 if not instances.is_registered(domain):
77 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
78 instances.add(domain, origin, command, path, software)
80 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
81 instances.set_last_instance_fetch(domain)
# Peers are only fetched when a software type is known.
84 logger.debug("software='%s'", software)
85 if software is not None:
87 logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
88 peerlist = fetch_peers(domain, software, origin)
89 except network.exceptions as exception:
90 logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))
92 logger.debug("peerlist[]='%s'", type(peerlist))
93 if isinstance(peerlist, list):
# NOTE(review): the log text names set_total_peerlist() while the call below
# is set_total_peers().
94 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
95 instances.set_total_peers(domain, peerlist)
# Nothing to crawl: flush pending updates and clear the domain's cookies.
97 logger.debug("peerlist[]='%s'", type(peerlist))
98 if peerlist is None or len(peerlist) == 0:
99 logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)
101 if instances.has_pending(domain):
102 logger.debug("Flushing updates for domain='%s' ...", domain)
103 instances.update_data(domain)
105 logger.debug("Invoking cookies.clear(%s) ...", domain)
106 cookies.clear(domain)
109 logger.debug("EXIT!")
112 logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
113 for instance in peerlist:
114 logger.debug("instance='%s'", instance)
115 if instance is None or instance == "":
116 logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
119 logger.debug("instance='%s' - BEFORE!", instance)
120 instance = tidyup.domain(instance)
121 logger.debug("instance='%s' - AFTER!", instance)
124 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
# Re-encode through the "idna" codec so that the checks below see the
# ASCII form of the name.
127 logger.debug("instance='%s' - BEFORE!", instance)
128 instance = instance.encode("idna").decode("utf-8")
129 logger.debug("instance='%s' - AFTER!", instance)
131 if not domain_helper.is_wanted(instance):
132 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
# "/c/" links are only skipped when the host part before the first "/" is
# already registered.
134 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
135 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
137 elif instance.find("/tag/") > 0:
138 logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
140 elif not instances.is_registered(instance):
# Flush this domain's pending data before descending into another instance.
141 logger.debug("Checking if domain='%s' has pending updates ...", domain)
142 if instances.has_pending(domain):
143 logger.debug("Flushing updates for domain='%s' ...", domain)
144 instances.update_data(domain)
# Recursion is limited by the configured max_crawl_depth and is only done
# when the peer list has at least min_peers_length entries.
146 logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
147 if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
148 logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
149 fetch_instances(instance, domain, None, command, path)
# NOTE(review): presumably the fallback branch of the check above (register
# the peer without crawling it) - confirm.
151 logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
152 instances.add(instance, domain, command)
154 logger.debug("Invoking cookies.clear(%s) ...", domain)
155 cookies.clear(domain)
157 logger.debug("Checking if domain='%s' has pending updates ...", domain)
158 if instances.has_pending(domain):
159 logger.debug("Flushing updates for domain='%s' ...", domain)
160 instances.update_data(domain)
163 logger.debug("EXIT!")
165 def fetch_peers(domain: str, software: str, origin: str) -> list:
# Fetches the peers of `domain`. misskey, lemmy and peertube are delegated to
# their network modules; any other software is queried through
# network.get_json_api() for the entries of `paths`.
#
# Parameters:
#   domain   - checked by domain_helper.raise_on()
#   software - str or None; selects the fetch strategy
#   origin   - str or None, must not be an empty string; only forwarded to
#              lemmy.fetch_peers()
#
# Raises:
#   ValueError - on a wrongly typed `software`/`origin` or an empty `origin`
166 logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
167 domain_helper.raise_on(domain)
169 if not isinstance(software, str) and software is not None:
170 raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
171 elif not isinstance(origin, str) and origin is not None:
172 raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
173 elif isinstance(origin, str) and origin == "":
174 raise ValueError("Parameter 'origin' is empty")
# Software-specific implementations return their result directly.
176 if software == "misskey":
177 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
178 return misskey.fetch_peers(domain)
179 elif software == "lemmy":
180 logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
181 return lemmy.fetch_peers(domain, origin)
182 elif software == "peertube":
183 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
184 return peertube.fetch_peers(domain)
186 # No CSRF by default, you don't have to add network.api_headers by yourself here
# NOTE(review): `csrf` is used below but its import is not visible here.
190 logger.debug("Checking CSRF for domain='%s'", domain)
191 headers = csrf.determine(domain, dict())
# A network failure during the CSRF check is stored as the domain's last error.
192 except network.exceptions as exception:
193 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
194 instances.set_last_error(domain, exception)
196 logger.debug("Returning empty list ... - EXIT!")
200 "/api/v1/instance/peers",
204 # Init peers variable
207 logger.debug("Checking %d paths ...", len(paths))
209 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
210 data = network.get_json_api(
214 (config.get("connection_timeout"), config.get("read_timeout"))
# An "error_message" key marks a failed request; a non-empty "json" entry
# marks a usable response.
217 logger.debug("data[]='%s'", type(data))
218 if "error_message" in data:
219 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
220 instances.set_last_error(domain, data)
221 elif "json" in data and len(data["json"]) > 0:
222 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
225 logger.debug("Marking domain='%s' as successfully handled ...", domain)
226 instances.set_success(domain)
229 if not isinstance(peers, list):
230 logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
# NOTE(review): fetch_instances() calls instances.set_total_peers() again
# with the list returned from here.
233 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
234 instances.set_total_peers(domain, peers)
236 logger.debug("peers()=%d - EXIT!", len(peers))
239 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
# Fetches `path` from `domain` and tries to read the software name from the
# HTML <meta> tags of the response. Tags are tried in this order:
# og:platform, generator, application-name, og:site_name; the first tag with
# a string "content" attribute is used. The detected name is then cleaned of
# version numbers and phrases such as "powered by".
#
# Parameters:
#   domain - checked by domain_helper.raise_on()
#   path   - str, defaults to "/"
#
# Raises:
#   ValueError - when `path` is not a str or is empty
#   requests.exceptions.TooManyRedirects - when the response URL does not
#       belong to `domain`
240 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
241 domain_helper.raise_on(domain)
243 if not isinstance(path, str):
244 raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
246 raise ValueError("Parameter 'path' is empty")
# NOTE(review): this repeats the CALLED! message already logged above.
248 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
251 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
252 response = network.fetch_response(
256 (config.get("connection_timeout"), config.get("read_timeout")),
# Status 410 is accepted alongside successful responses.
# NOTE(review): `find("<html") > 0` is false when the text starts with
# "<html" at index 0 - confirm this is intended.
260 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
261 if ((response.ok and response.status_code < 300) or response.status_code == 410) and response.text.find("<html") > 0 and domain_helper.is_in_url(domain, response.url):
262 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
263 doc = bs4.BeautifulSoup(response.text, "html.parser")
265 logger.debug("doc[]='%s'", type(doc))
266 platform = doc.find("meta", {"property": "og:platform"})
267 generator = doc.find("meta", {"name" : "generator"})
268 site_name = doc.find("meta", {"property": "og:site_name"})
269 app_name = doc.find("meta", {"name" : "application-name"})
# The detection mode is only recorded when the tidied content is non-empty.
271 logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s',app_name[]='%s'", type(generator), type(site_name), type(platform), type(app_name))
272 if isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
273 logger.debug("Found property=og:platform, domain='%s'", domain)
274 software = tidyup.domain(platform.get("content"))
276 logger.debug("software[%s]='%s'", type(software), software)
277 if software is not None and software != "":
278 logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
279 instances.set_detection_mode(domain, "PLATFORM")
280 elif isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
281 logger.debug("Found generator meta tag: domain='%s'", domain)
282 software = tidyup.domain(generator.get("content"))
284 logger.debug("software[%s]='%s'", type(software), software)
285 if software is not None and software != "":
286 logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
287 instances.set_detection_mode(domain, "GENERATOR")
288 elif isinstance(app_name, bs4.element.Tag) and isinstance(app_name.get("content"), str):
# NOTE(review): the tag matched here is <meta name="application-name">; the
# log texts below call it "og:app_name" / "app_name".
289 logger.debug("Found property=og:app_name, domain='%s'", domain)
290 software = tidyup.domain(app_name.get("content"))
292 logger.debug("software[%s]='%s'", type(software), software)
293 if software is not None and software != "":
294 logger.debug("domain='%s' has application-name='%s' - Setting detection_mode=app_name ...", domain, software)
295 instances.set_detection_mode(domain, "APP_NAME")
296 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
297 logger.debug("Found property=og:site_name, domain='%s'", domain)
298 software = tidyup.domain(site_name.get("content"))
300 logger.debug("software[%s]='%s'", type(software), software)
301 if software is not None and software != "":
302 logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
303 instances.set_detection_mode(domain, "SITE_NAME")
# Redirect to a foreign host: crawl the target if it is unknown, reset this
# domain's software, detection mode and nodeinfo URL, then raise.
# NOTE(review): presumably the alternative to the response check at the top
# of this chain rather than to the meta-tag branches - confirm.
304 elif not domain_helper.is_in_url(domain, response.url):
305 logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)
307 components = urlparse(response.url)
309 logger.debug("components[]='%s'", type(components))
310 if not instances.is_registered(components.netloc):
311 logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
312 fetch_instances(components.netloc, domain, None, "fetch_generator")
314 message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
315 instances.set_last_error(domain, message)
316 instances.set_software(domain, None)
317 instances.set_detection_mode(domain, None)
318 instances.set_nodeinfo_url(domain, None)
320 raise requests.exceptions.TooManyRedirects(message)
# Post-processing of the detected name.
322 logger.debug("software[]='%s'", type(software))
323 if isinstance(software, str) and software == "":
324 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
326 elif isinstance(software, str) and ("." in software or " " in software):
327 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
328 software = version.remove(software)
# Only the first matching phrase is stripped because this is an elif chain.
330 logger.debug("software[]='%s'", type(software))
331 if isinstance(software, str) and "powered by " in software:
332 logger.debug("software='%s' has 'powered by' in it", software)
333 software = version.remove(software_helper.strip_powered_by(software))
334 elif isinstance(software, str) and " hosted on " in software:
335 logger.debug("software='%s' has 'hosted on' in it", software)
336 software = version.remove(software_helper.strip_hosted_on(software))
337 elif isinstance(software, str) and " by " in software:
338 logger.debug("software='%s' has ' by ' in it", software)
339 software = software_helper.strip_until(software, " by ")
340 elif isinstance(software, str) and " see " in software:
341 logger.debug("software='%s' has ' see ' in it", software)
342 software = software_helper.strip_until(software, " see ")
344 logger.debug("software='%s' - EXIT!", software)
347 def determine_software(domain: str, path: str = None) -> str:
# Determines the software name of `domain`. nodeinfo is queried first via
# nodeinfo.fetch_nodeinfo(); whenever that yields no usable name the function
# falls back to fetch_generator_from_path(). The result is passed through
# software_helper.alias() and stripped of version numbers / "powered by".
#
# Parameters:
#   domain - checked by domain_helper.raise_on()
#   path   - str or None, forwarded to nodeinfo.fetch_nodeinfo()
#
# Raises:
#   ValueError - when `path` is neither str nor None
#   the exception stored under data["exception"], when nodeinfo returned one
348 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
349 domain_helper.raise_on(domain)
351 if not isinstance(path, str) and path is not None:
352 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
354 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
357 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
358 data = nodeinfo.fetch_nodeinfo(domain, path)
360 logger.debug("data[%s]='%s'", type(data), data)
361 if "exception" in data:
362 # Continue raising it
363 logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
364 raise data["exception"]
365 elif "error_message" in data:
366 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
367 software = fetch_generator_from_path(domain)
368 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# NOTE(review): the checks below index `data` directly (data["software"],
# data["server"]); presumably `data` has been replaced by its "json" payload
# by this point - confirm.
370 logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
373 logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
374 software = fetch_generator_from_path(domain)
375 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# Interpret the nodeinfo payload. Error/message-only responses reset the
# stored detection mode and nodeinfo URL before falling back to the generator.
377 if "status" in data and data["status"] == "error" and "message" in data:
378 logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
379 instances.set_last_error(domain, data["message"])
380 instances.set_detection_mode(domain, None)
381 instances.set_nodeinfo_url(domain, None)
382 software = fetch_generator_from_path(domain)
383 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
384 elif "software" in data and "name" in data["software"]:
385 logger.debug("Found data[json][software][name] in JSON response")
386 software = data["software"]["name"]
387 logger.debug("software[%s]='%s' - FOUND!", type(software), software)
388 elif "message" in data:
389 logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
390 instances.set_last_error(domain, data["message"])
391 instances.set_detection_mode(domain, None)
392 instances.set_nodeinfo_url(domain, None)
394 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
395 software = fetch_generator_from_path(domain)
396 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
397 elif "server" in data and "software" in data["server"]:
398 logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
399 software = data["server"]["software"].lower()
400 logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
# This condition is the negation of the software/name branch above, so it
# acts as the catch-all of the chain.
401 elif "software" not in data or "name" not in data["software"]:
402 logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
403 instances.set_detection_mode(domain, None)
404 instances.set_nodeinfo_url(domain, None)
406 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
407 software = fetch_generator_from_path(domain)
408 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
410 logger.debug("software[%s]='%s'", type(software), software)
412 logger.debug("Returning None - EXIT!")
# Map known aliases to their canonical software name.
415 logger.debug("software='%s'- BEFORE!", software)
416 software = software_helper.alias(software)
417 logger.debug("software['%s']='%s' - AFTER!", type(software), software)
# NOTE(review): str() makes the length test pass for non-string values too;
# the `in software` tests would then raise TypeError if alias() can return
# something that is not a str - confirm.
419 if str(software) == "":
420 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
421 software = fetch_generator_from_path(domain)
422 elif len(str(software)) > 0 and ("." in software or " " in software):
423 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
424 software = version.remove(software)
426 logger.debug("software[]='%s'", type(software))
427 if isinstance(software, str) and "powered by" in software:
428 logger.debug("software='%s' has 'powered by' in it", software)
429 software = version.remove(software_helper.strip_powered_by(software))
431 logger.debug("software='%s' - EXIT!", software)
434 def find_domains(tag: bs4.element.Tag) -> list:
# Walks the <tr> rows of `tag`, reading a domain from the first <td> and a
# reason from the second <td> of each row. Rows without a <td>, unwanted
# domains and invalid domains are skipped.
#
# Parameters:
#   tag - bs4.element.Tag containing the table rows
#
# Raises:
#   ValueError - when `tag` is not a bs4.element.Tag
#   KeyError   - when `tag` contains no <tr> element
435 logger.debug("tag[]='%s' - CALLED!", type(tag))
436 if not isinstance(tag, bs4.element.Tag):
437 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
438 elif len(tag.select("tr")) == 0:
439 raise KeyError("No table rows found in table!")
442 for element in tag.select("tr"):
443 logger.debug("element[]='%s'", type(element))
444 if not element.find("td"):
445 logger.debug("Skipping element, no <td> found")
# NOTE(review): index [1] raises IndexError for a row with a single <td>;
# findAll() is the legacy spelling of find_all() in Beautiful Soup 4.
448 domain = tidyup.domain(element.find("td").text)
449 reason = tidyup.reason(element.findAll("td")[1].text)
451 logger.debug("domain='%s',reason='%s'", domain, reason)
453 if not domain_helper.is_wanted(domain):
454 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
# Special case: one known table cell that lists several domains at once.
456 elif domain == "gab.com/.ai, develop.gab.com":
457 logger.debug("Multiple domains detected in one row")
467 "domain": "develop.gab.com",
# Only the part before the first "/" is validated as a domain name.
471 elif not validators.domain(domain.split("/")[0]):
472 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
475 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
481 logger.debug("domains()=%d - EXIT!", len(domains))
484 def add_peers(rows: dict) -> list:
# Collects peers from the "linked", "allowed" and "blocked" entries of `rows`.
# Each peer may be a plain string or a dict with a "domain" key; both are
# normalised with tidyup.domain() and dropped when domain_helper.is_wanted()
# rejects them.
#
# Parameters:
#   rows - dict whose relevant keys hold iterables of peers (or None)
#
# Raises:
#   ValueError - when `rows` is not a dict or a peer has an unsupported type
485 logger.debug("rows[]='%s' - CALLED!", type(rows))
486 if not isinstance(rows, dict):
487 raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")
# Missing or None-valued keys are skipped.
490 for key in ["linked", "allowed", "blocked"]:
491 logger.debug("Checking key='%s'", key)
492 if key not in rows or rows[key] is None:
493 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
496 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
497 for peer in rows[key]:
498 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
499 if peer is None or peer == "":
500 logger.debug("peer is empty - SKIPPED")
502 elif isinstance(peer, dict) and "domain" in peer:
503 logger.debug("peer[domain]='%s'", peer["domain"])
504 peer = tidyup.domain(peer["domain"])
505 elif isinstance(peer, str):
506 logger.debug("peer='%s'", peer)
507 peer = tidyup.domain(peer)
# Any other peer type (including a dict without "domain") is rejected.
509 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
511 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
512 if not domain_helper.is_wanted(peer):
513 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
516 logger.debug("Appending peer='%s' ...", peer)
519 logger.debug("peers()=%d - EXIT!", len(peers))