1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
26 from fba.helpers import config
27 from fba.helpers import cookies
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import software as software_helper
30 from fba.helpers import tidyup
31 from fba.helpers import version
33 from fba.http import network
34 from fba.http import nodeinfo
36 from fba.models import instances
38 from fba.networks import lemmy
39 from fba.networks import misskey
40 from fba.networks import peertube
# Depth counter, being raised and lowered by fetch_instances() around each
# (possibly recursive) invocation; it is compared there against the
# configured 'max_crawl_depth'.
_DEPTH = 0

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Registers domain if needed and crawls the peers it reports.

    Peers that are wanted and not yet registered are either crawled
    recursively (as long as the depth counter and the size of the peer
    list allow it, see config keys 'max_crawl_depth' and
    'min_peers_length') or merely added as new instances.

    Parameters:
        domain:   Domain to fetch peers from, checked by domain_helper.raise_on()
        origin:   Domain that led to this one; may be None unless command requires it
        software: Software type of domain, None to have it determined
        command:  Name of the invoking command, must not be empty
        path:     Optional path handed to determine_software() and instances.add()

    Raises:
        ValueError: If a parameter has an unsupported type or value
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)

    # Raise the depth counter for the duration of this invocation. The finally
    # clause lowers it again even when validation or a later step raises, so a
    # failed crawl cannot leave the counter permanently raised.
    _DEPTH = _DEPTH + 1
    try:
        _crawl_domain(domain, origin, software, command, path)
    finally:
        _DEPTH = _DEPTH - 1

    logger.debug("EXIT!")

def _crawl_domain(domain: str, origin: str, software: str, command: str, path: str) -> None:
    """Validates the parameters of fetch_instances() and performs one crawl step.

    Kept separate so that fetch_instances() can wrap the whole step in a
    single try/finally that maintains the depth counter.
    """
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif command in ["fetch_blocks", "fetch_cs", "fetch_bkali", "fetch_relays", "fetch_fedipact", "fetch_joinmobilizon", "fetch_joinmisskey", "fetch_joinfediverse"] and origin is None:
        raise ValueError(f"Parameter command='{command}' but origin is None, please fix invoking this function.")
    elif software is None:
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best effort: remember the error and continue with software=None
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Checking if domain='%s' is registered ...", domain)
    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    # NOTE(review): assumed to run for every crawled domain, not only for
    # freshly added ones - confirm the intended nesting.
    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    # Must be pre-set: it stays empty when no software is known or fetching fails
    peerlist = list()

    logger.debug("software='%s'", software)
    if software is not None:
        try:
            logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
            peerlist = fetch_peers(domain, software, origin)
        except network.exceptions as exception:
            logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None or len(peerlist) == 0:
        logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)

        if instances.has_pending(domain):
            logger.debug("Flushing updates for domain='%s' ...", domain)
            instances.update_data(domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        # Nothing to crawl
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s',depth=%d ...", len(peerlist), domain, software, _DEPTH)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)
        if instance is None or instance == "":
            logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
            continue

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        # Tidying may leave nothing behind; such entries cannot be encoded below
        if not instance:
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue

        # Convert international domain names to their ASCII (punycode) form
        logger.debug("instance='%s' - BEFORE!", instance)
        instance = instance.encode("idna").decode("utf-8")
        logger.debug("instance='%s' - AFTER!", instance)

        if not domain_helper.is_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            # Write out pending changes of the current domain before descending
            logger.debug("Checking if domain='%s' has pending updates ...", domain)
            if instances.has_pending(domain):
                logger.debug("Flushing updates for domain='%s' ...", domain)
                instances.update_data(domain)

            logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
            if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                fetch_instances(instance, domain, None, command, path)
            else:
                # Too deep or peer list too short: only remember the instance
                logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Fetches the peer list of domain.

    For misskey, lemmy and peertube the result of the respective network
    module's fetch_peers() is returned. For any other software the generic
    API paths are queried in order and the first non-empty JSON response
    is taken.

    Parameters:
        domain:   Domain to fetch peers from, checked by domain_helper.raise_on()
        software: Software type of domain, may be None
        origin:   Origin of domain, may be None but not an empty string

    Returns:
        The fetched peers; an empty list when CSRF checking failed or the
        API did not answer with a list

    Raises:
        ValueError: If a parameter has an unsupported type or value
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    try:
        # No CSRF by default, you don't have to add network.api_headers by yourself here
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # NOTE(review): only the first entry was present in the reviewed text,
    # the second one is restored from context - confirm the list.
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)

            # First usable response wins, no need to try the remaining paths
            break

    # A successful response is not necessarily a list of peers
    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Tries to detect the software of domain from the HTML found at path.

    The meta tags og:platform, generator, application-name and
    og:site_name are checked in that order; the first one that is present
    with a string content is used and the matching detection mode is
    stored. Version numbers and phrases such as 'powered by' are stripped
    from the result.

    Parameters:
        domain: Domain to fetch from, checked by domain_helper.raise_on()
        path:   Path to fetch, must be a non-empty string

    Returns:
        The detected software name or None if nothing was detected

    Raises:
        ValueError: If path is not a string or is empty
        requests.exceptions.TooManyRedirects: If the response could not be
            used and its URL does not belong to domain
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    # NOTE(review): the arguments apart from the timeout tuple were absent
    # from the reviewed text and are restored from context - confirm them.
    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    # A containment test is used for '<html' on purpose: str.find() returns 0
    # for documents that start directly with the tag, which a '> 0' comparison
    # would wrongly reject.
    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if ((response.ok and response.status_code < 300) or response.status_code == 410) and "<html" in response.text and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        platform  = doc.find("meta", {"property": "og:platform"})
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        app_name  = doc.find("meta", {"name"    : "application-name"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s',app_name[]='%s'", type(generator), type(site_name), type(platform), type(app_name))
        if isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
        elif isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(app_name, bs4.element.Tag) and isinstance(app_name.get("content"), str):
            logger.debug("Found property=og:app_name, domain='%s'", domain)
            software = tidyup.domain(app_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has application-name='%s' - Setting detection_mode=app_name ...", domain, software)
                instances.set_detection_mode(domain, "APP_NAME")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        # Crawl the redirect target under its own name if it is still unknown
        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        # Nothing found here can be attributed to domain, reset what is known
        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(software_helper.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = software_helper.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = software_helper.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determines the software type of domain.

    Nodeinfo is fetched first. If it cannot be used, the function falls
    back to fetch_generator_from_path(). The result is passed through
    software_helper.alias() and stripped of version numbers and a
    'powered by' phrase.

    Parameters:
        domain: Domain to check, validated by domain_helper.raise_on()
        path:   Optional path handed to nodeinfo.fetch_nodeinfo()

    Returns:
        The software name or None if it could not be determined

    Raises:
        ValueError: If path is neither a string nor None
        Exception:  Whatever exception nodeinfo.fetch_nodeinfo() reported
            under the key 'exception' is raised again
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = nodeinfo.fetch_nodeinfo(domain, path)

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        # From here on only the decoded JSON document is of interest
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]
    else:
        logger.debug("Auto-detection for domain='%s' was failing, fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    # NOTE(review): when the chain above already fell back to the generator,
    # data is still the raw result here, so one of the fallback branches below
    # may fetch the generator a second time - confirm this is intended.
    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "server" in data and "software" in data["server"]:
        logger.debug("Found data[server][software]='%s' for domain='%s'", data["server"]["software"].lower(), domain)
        software = data["server"]["software"].lower()
        logger.debug("Detected software for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif isinstance(software, str) and ("." in software or " " in software):
        # The isinstance() guard keeps the 'in' tests from raising TypeError
        # should alias() have returned something other than a string
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(software_helper.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extracts domain/reason pairs from the rows of an HTML table.

    The first cell of each row is taken as domain, the second one as
    reason. Rows without any <td>, unwanted domains and invalid domains
    are skipped.

    Parameters:
        tag: Element whose <tr> descendants are examined

    Returns:
        List of dicts with the keys 'domain' and 'reason'

    Raises:
        ValueError: If tag is not a bs4.element.Tag
        KeyError:   If tag contains no table rows
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows only once, they are needed for the check and the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        logger.debug("element[]='%s'", type(element))
        cells = element.find_all("td")
        if len(cells) == 0:
            # E.g. header rows consisting of <th> only
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Special case: a single cell listing several domains.
            # NOTE(review): the first two entries were absent from the reviewed
            # text and are derived from the cell text above - confirm them.
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collects the wanted peers from the keys 'linked', 'allowed' and 'blocked'.

    Each entry may be a string or a dict carrying a 'domain' key; it is
    tidied up with tidyup.domain() and kept only if
    domain_helper.is_wanted() accepts it.

    Parameters:
        rows: Dict that may contain lists under the three keys above

    Returns:
        List of tidied-up peer domains

    Raises:
        ValueError: If rows is not a dict or an entry has an unsupported type
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not domain_helper.is_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers