1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
24 from fba import database
27 from fba.helpers import blacklist
28 from fba.helpers import cache
29 from fba.helpers import config
31 from fba.http import federation
32 from fba.http import network
34 from fba.models import error_log
36 logging.basicConfig(level=logging.INFO)
37 logger = logging.getLogger(__name__)
# Information found about a node (such as its nodeinfo URL or detection mode)
# that still needs to be written to the database. Both arrays must be filled
# at the same time, or else update_data() will fail.
43 # Detection mode: 'AUTO_DISCOVERY', 'STATIC_CHECKS' or 'GENERATOR'
44 # NULL means all detection methods have failed (maybe still reachable instance)
45 "detection_mode" : {},
50 # Last fetched instances
51 "last_instance_fetch": {},
56 # Last nodeinfo (fetched)
59 "last_status_code" : {},
61 "last_error_details" : {},
def _set_data(key: str, domain: str, value: any):
    """Stage a value for a domain under one of the keys of ``_pending``.

    The value is only stored in the module-level ``_pending`` mapping; this
    function itself does not execute any SQL.

    Args:
        key: Name of an existing entry in ``_pending``.
        domain: All lower-case domain name the value belongs to.
        value: Value to stage; must pass ``utils.is_primitive()``.

    Raises:
        ValueError: If a parameter has the wrong type, is empty, the domain is
            not lower-case, fails ``validators.domain()``, ends with ``.arpa``
            or ``.tld``, the key is unknown or the value is not primitive.
    """
    logger.debug(f"key='{key}',domain='{domain}',value[]='{type(value)}' - CALLED!")
    if not isinstance(key, str):
        # This message needs the f-prefix, otherwise the placeholder is
        # printed literally instead of the offending type.
        raise ValueError(f"Parameter key[]='{type(key)}' is not 'str'")
    elif key == "":
        raise ValueError("Parameter 'key' is empty")
    elif not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        # Only the part before the first slash is validated as a domain
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not key in _pending:
        raise ValueError(f"key='{key}' not found in _pending")
    elif not utils.is_primitive(value):
        raise ValueError(f"value[]='{type(value)}' is not a primitive type")

    # Set it
    _pending[key][domain] = value

    logger.debug("EXIT!")
def has_pending(domain: str) -> bool:
    """Tell whether any key of ``_pending`` holds a staged value for ``domain``.

    Args:
        domain: All lower-case domain name to look up.

    Returns:
        True as soon as one entry of ``_pending`` contains the domain,
        False otherwise.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    has = False
    for key in _pending:
        logger.debug(f"key='{key}',domain='{domain}',_pending[key]()='{len(_pending[key])}'")
        if domain in _pending[key]:
            # One hit is enough, no need to look at the remaining keys
            has = True
            break

    logger.debug(f"has='{has}' - EXIT!")
    return has
def update_data(domain: str):
    """Write all values staged in ``_pending`` for ``domain`` to the database.

    Builds one ``UPDATE instances`` statement from every key of ``_pending``
    that holds a value for the domain, additionally sets ``last_updated`` to
    the current time, commits, and finally removes the staged values.

    Args:
        domain: All lower-case domain name of a registered instance.

    Raises:
        ValueError: If the domain fails validation or no field was collected.
        Exception: If nothing is pending for the domain, the domain is not
            registered, or the UPDATE statement did not change any row.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not has_pending(domain):
        raise Exception(f"domain='{domain}' has no pending instance data, but function invoked")
    elif not is_registered(domain):
        raise Exception(f"domain='{domain}' cannot be updated while not being registered")

    logger.debug(f"Updating instance data for domain='{domain}' ...")
    sql_string = ""
    fields = list()
    for key in _pending:
        # Arguments passed to logger.debug() need a matching placeholder,
        # otherwise formatting the record fails once DEBUG is enabled.
        logger.debug("key='%s'", key)
        if domain in _pending[key]:
            logger.debug(f"Adding '{_pending[key][domain]}' for key='{key}' ...")
            fields.append(_pending[key][domain])
            # Column names come from the keys of _pending, values are bound
            # as parameters
            sql_string += f" {key} = ?,"

    logger.debug(f"sql_string()={len(sql_string)}")
    if sql_string == "":
        raise ValueError(f"No fields have been set, but method invoked, domain='{domain}'")

    # Set last_updated to current timestamp
    fields.append(time.time())

    # For WHERE statement
    fields.append(domain)

    logger.debug(f"sql_string='{sql_string}',fields()={len(fields)}")
    sql_string = "UPDATE instances SET" + sql_string + " last_updated = ? WHERE domain = ? LIMIT 1"
    logger.debug("sql_string='%s'", sql_string)

    logger.debug("Executing SQL: '%s'", sql_string)
    database.cursor.execute(sql_string, fields)

    logger.debug(f"Success! (rowcount={database.cursor.rowcount })")
    if database.cursor.rowcount == 0:
        raise Exception(f"Did not update any rows: domain='{domain}',fields()={len(fields)}")

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug(f"Deleting _pending for domain='{domain}'")
    for key in _pending:
        logger.debug(f"domain='{domain}',key='{key}'")
        if domain in _pending[key]:
            del _pending[key][domain]

    logger.debug("EXIT!")
def add(domain: str, origin: str, command: str, path: str = None, software: str = None):
    """Register a new instance in the ``instances`` table.

    When no software is given, it is determined through
    ``federation.determine_software()``; a network error during that step is
    recorded with ``set_last_error()`` and the instance is still inserted.
    After the INSERT the domain is marked as registered in the cache and any
    values already staged for it are flushed with ``update_data()``.

    Args:
        domain: All lower-case domain name of the instance.
        origin: Domain that led to this instance, or None.
        command: Non-empty string stored in the ``command`` column.
        path: Optional path handed to ``federation.determine_software()``.
        software: Optional, already known software type.

    Raises:
        ValueError: If a parameter has the wrong type, is empty or a domain
            fails validation.
        Exception: If the domain is blacklisted or points to a single user.
    """
    logger.debug(f"domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(command, str):
        raise ValueError(f"command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")
    elif software == "":
        raise ValueError("Parameter 'software' is empty")
    elif origin is not None and not validators.domain(origin.split("/")[0]):
        raise ValueError(f"Bad origin name='{origin}'")
    elif blacklist.is_blacklisted(domain):
        raise Exception(f"domain='{domain}' is blacklisted, but method invoked")
    elif domain.find("/profile/") > 0 or domain.find("/users/") > 0 or (software == "lemmy" and domain.find("/c/") > 0):
        raise Exception(f"domain='{domain}' is a single user")
    # NOTE: The repeated validators.domain(), ".arpa" and ".tld" checks of the
    # former chain were unreachable (identical conditions are tested above)
    # and have been dropped.

    if software is None:
        try:
            # Arguments passed to logger.debug() need matching placeholders
            logger.debug("domain='%s',origin='%s',command='%s',path='%s'", domain, origin, command, path)
            software = federation.determine_software(domain, path)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type, domain='%s'", type(exception), domain)
            set_last_error(domain, exception)

    logger.debug("Determined software='%s'", software)
    if software == "lemmy" and domain.find("/c/") > 0:
        # Only reachable when the software was just determined, a given
        # software 'lemmy' with '/c/' has already been rejected above
        domain = domain.split("/c/")[0]
        if is_registered(domain):
            logger.warning("domain='%s' already registered after cutting off user part. - EXIT!", domain)
            return

    logger.info("Adding instance domain='%s' (origin='%s',software='%s')", domain, origin, software)
    database.cursor.execute(
        "INSERT INTO instances (domain, origin, command, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)",
        (
            domain,
            origin,
            command,
            utils.get_hash(domain),
            software,
            time.time()
        ),
    )

    logger.debug(f"Marking domain='{domain}' as registered.")
    cache.set_sub_key("is_registered", domain, True)

    if has_pending(domain):
        logger.debug(f"domain='{domain}' has pending nodeinfo being updated ...")
        update_data(domain)

    logger.debug("EXIT!")
def set_last_nodeinfo(domain: str):
    """Stage the current time as ``last_nodeinfo`` and flush it right away.

    Args:
        domain: All lower-case domain name of the instance.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
        Exception: Whatever ``update_data()`` raises, e.g. for a domain that
            is not registered.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # Arguments passed to logger.debug() need a matching placeholder
    logger.debug("Updating last_nodeinfo for domain='%s'", domain)
    _set_data("last_nodeinfo", domain, time.time())

    # Running pending updated
    logger.debug(f"Invoking update_data({domain}) ...")
    update_data(domain)

    logger.debug("EXIT!")
def set_last_error(domain: str, error: dict):
    """Stage status code and error details for ``domain`` and log the error.

    Accepted shapes of ``error``:
      * an exception: converted to a string first, then handled like one,
      * a ``str``: staged with status code 999,
      * a ``requests.models.Response``: its ``status_code`` and ``reason``,
      * a ``dict`` with ``status_code`` and ``error_message``,
      * a ``dict`` with ``json`` containing ``error``.

    Empty detail strings are staged as None. A dict matching none of the
    shapes stages nothing. In every accepted case the error is finally handed
    to ``error_log.add()``.

    Args:
        domain: All lower-case domain name of the instance.
        error: The error in one of the shapes listed above.

    Raises:
        ValueError: If the domain fails validation.
        KeyError: If ``error`` is of an unsupported type.
    """
    # Arguments passed to logger.debug() need matching placeholders,
    # otherwise formatting the record fails once DEBUG is enabled.
    logger.debug("domain='%s',error[]='%s' - CALLED!", domain, type(error))
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("BEFORE error[]='%s'", type(error))
    # JSONDecodeError derives from BaseException as well, so a single check
    # covers every exception type.
    if isinstance(error, BaseException):
        error = f"error[{type(error)}]='{str(error)}'"
    logger.debug("AFTER error[]='%s'", type(error))

    if isinstance(error, str):
        logger.debug(f"Setting last_error_details='{error}'")
        _set_data("last_status_code"  , domain, 999)
        _set_data("last_error_details", domain, error if error != "" else None)
    elif isinstance(error, requests.models.Response):
        logger.debug(f"Setting last_error_details='{error.reason}'")
        _set_data("last_status_code"  , domain, error.status_code)
        _set_data("last_error_details", domain, error.reason if error.reason != "" else None)
    elif not isinstance(error, dict):
        raise KeyError(f"Cannot handle keys in error[{type(error)}]='{error}'")
    elif "status_code" in error and "error_message" in error:
        logger.debug(f"Setting last_error_details='{error['error_message']}'")
        _set_data("last_status_code"  , domain, error["status_code"])
        _set_data("last_error_details", domain, error["error_message"] if error["error_message"] != "" else None)
    elif "json" in error and "error" in error["json"]:
        # NOTE(review): 'status_code' is read without a presence check here,
        # a dict lacking it raises KeyError - confirm callers always set it.
        _set_data("last_status_code"  , domain, error["status_code"])
        _set_data("last_error_details", domain, error["json"]["error"] if error["json"]["error"] != "" else None)

    logger.debug(f"Invoking error_log.add(domain='{domain}',error[]='{type(error)}'")
    error_log.add(domain, error)

    logger.debug("EXIT!")
def is_registered(domain: str) -> bool:
    """Tell whether ``domain`` is known in the ``is_registered`` cache.

    On first use the cache is filled with every domain selected from the
    ``instances`` table; afterwards only the cache is consulted.

    Args:
        domain: All lower-case domain name to look up.

    Returns:
        The result of ``cache.sub_key_exists("is_registered", domain)``.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    # Logged once only, the former second "CALLED!" line was a duplicate
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    if not cache.key_exists("is_registered"):
        logger.debug("Cache for 'is_registered' not initialized, fetching all rows ...")
        database.cursor.execute("SELECT domain FROM instances")

        # Fill the cache with all fetched rows
        cache.set_all("is_registered", database.cursor.fetchall(), True)

    # Is the domain found in cache?
    registered = cache.sub_key_exists("is_registered", domain)

    logger.debug(f"registered='{registered}' - EXIT!")
    return registered
def is_recent(domain: str) -> bool:
    """Tell whether instances were fetched from ``domain`` recently.

    "Recently" means the stored ``last_instance_fetch`` is a float and not
    older than ``config.get("recheck_instance")`` seconds.

    Args:
        domain: All lower-case domain name to check.

    Returns:
        False for an unregistered domain, a missing row or a non-float
        timestamp; otherwise whether the timestamp is within the interval.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not is_registered(domain):
        logger.debug(f"domain='{domain}' is not registered, returning False - EXIT!")
        return False

    # Query database
    database.cursor.execute("SELECT last_instance_fetch FROM instances WHERE domain = ? LIMIT 1", [domain])

    # Fetch row; fetchone() returns None when the query matched nothing, so
    # guard against that before indexing into the row.
    row = database.cursor.fetchone()
    if row is None:
        logger.debug(f"domain='{domain}' has no row, returning False - EXIT!")
        return False

    fetched = row[0]

    logger.debug(f"fetched[{type(fetched)}]='{fetched}'")
    recently = isinstance(fetched, float) and time.time() - fetched <= config.get("recheck_instance")

    logger.debug(f"recently='{recently}' - EXIT!")
    return recently
def deobscure(char: str, domain: str, blocked_hash: str = None) -> tuple:
    """Look up the instance row behind an obscured domain.

    With a ``blocked_hash`` the row is searched by its ``hash`` column first;
    when that finds nothing the function calls itself again without the hash.
    Without a hash, every occurrence of ``char`` in ``domain`` is replaced by
    the single-character wildcard ``_`` and the result is matched with LIKE,
    taking the first row by ``rowid``.

    Args:
        char: Non-empty character (sequence) that obscures the domain.
        domain: All lower-case, obscured domain name.
        blocked_hash: Optional hash of the real domain.

    Returns:
        The row ``(domain, origin, nodeinfo_url)`` as returned by
        ``fetchone()``, which is None when nothing matched.

    Raises:
        ValueError: If a parameter has the wrong type, is empty or the domain
            fails validation.
    """
    logger.debug(f"char='{char}',domain='{domain}',blocked_hash='{blocked_hash}' - CALLED!")

    # Guard clauses, each of them leaves the function by raising
    if not isinstance(char, str):
        raise ValueError(f"Parameter char[]='{type(char)}' is not 'str'")
    if char == "":
        raise ValueError("Parameter 'char' is empty")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if domain != domain.lower():
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    # NOTE(review): the obscured domain itself is validated here - confirm
    # validators.domain() accepts the obscuring character.
    if not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    if domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    if domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    if blocked_hash is not None and not isinstance(blocked_hash, str):
        raise ValueError(f"Parameter blocked_hash[]='{type(blocked_hash)}' is not 'str'")

    # Choose statement and parameters depending on the lookup mode
    by_hash = isinstance(blocked_hash, str)
    if by_hash:
        logger.debug(f"Looking up blocked_hash='{blocked_hash}' ...")
        statement = "SELECT domain, origin, nodeinfo_url FROM instances WHERE hash = ? LIMIT 1"
        parameters = [blocked_hash]
    else:
        logger.debug(f"Looking up domain='{domain}' ...")
        statement = "SELECT domain, origin, nodeinfo_url FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1"
        parameters = [domain.replace(char, "_")]

    database.cursor.execute(statement, parameters)

    found = database.cursor.fetchone()
    logger.debug("row[]='%s'", type(found))

    if by_hash and found is None:
        # Hash is unknown, fall back to the wildcard lookup by domain
        logger.debug(f"blocked_hash='{blocked_hash}' not found, trying domain='{domain}' ...")
        return deobscure(char, domain)

    logger.debug(f"row[]='{type(found)}' - EXIT!")
    return found
def set_last_blocked(domain: str):
    """Stage the current time as ``last_blocked`` for ``domain``.

    The value is only staged through ``_set_data()``; this function does not
    invoke ``update_data()`` itself.

    Args:
        domain: All lower-case domain name of the instance.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # Set timestamp
    _set_data("last_blocked", domain, time.time())
    logger.debug("EXIT!")
def set_last_instance_fetch(domain: str):
    """Stage the current time as ``last_instance_fetch`` for ``domain``.

    The value is only staged through ``_set_data()``; this function does not
    invoke ``update_data()`` itself.

    Args:
        domain: All lower-case domain name of the instance.

    Raises:
        ValueError: If the domain is not a string, is empty, not lower-case,
            fails ``validators.domain()`` or ends with ``.arpa`` / ``.tld``.
    """
    # The type check must come before the entry log, because len() on a
    # non-string would raise TypeError instead of the intended ValueError.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")

    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # Set timestamp
    _set_data("last_instance_fetch", domain, time.time())
    logger.debug("EXIT!")
def set_total_peers(domain: str, peers: list):
    """Stage the number of entries in ``peers`` as ``total_peers``.

    The value is only staged through ``_set_data()``; this function does not
    invoke ``update_data()`` itself.

    Args:
        domain: All lower-case domain name of the instance.
        peers: List of peers, only its length is stored.

    Raises:
        ValueError: If the domain fails validation or ``peers`` is not a list.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(peers, list):
        raise ValueError(f"Parameter peers[]='{type(peers)}' is not 'list'")

    # Logged only after validation: len(peers) would raise TypeError for
    # e.g. None before the type check above had a chance to report it.
    logger.debug(f"domain='{domain}',peers()={len(peers)} - CALLED!")

    # Set the number of peers
    _set_data("total_peers", domain, len(peers))
    logger.debug("EXIT!")
def set_nodeinfo_url(domain: str, url: str):
    """Stage ``url`` as ``nodeinfo_url`` for ``domain``.

    The value is only staged through ``_set_data()``; this function does not
    invoke ``update_data()`` itself.

    Args:
        domain: All lower-case domain name of the instance.
        url: Non-empty nodeinfo URL.

    Raises:
        ValueError: If the domain fails validation or ``url`` is not a
            non-empty string.
    """
    logger.debug(f"domain='{domain}',url='{url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(url, str):
        # The message needs the f-prefix and must name the type that is
        # actually expected, which is 'str'.
        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
    elif url == "":
        raise ValueError("Parameter 'url' is empty")

    # Set the nodeinfo URL
    _set_data("nodeinfo_url", domain, url)
    logger.debug("EXIT!")
def set_detection_mode(domain: str, mode: str):
    """Stage ``mode`` as ``detection_mode`` for ``domain``.

    The value is only staged through ``_set_data()``; this function does not
    invoke ``update_data()`` itself.

    Args:
        domain: All lower-case domain name of the instance.
        mode: Non-empty detection mode; the comment on ``_pending`` names
            'AUTO_DISCOVERY', 'STATIC_CHECKS' and 'GENERATOR', but the value
            is not checked against them here.

    Raises:
        ValueError: If the domain fails validation or ``mode`` is not a
            non-empty string.
    """
    logger.debug(f"domain='{domain}',mode='{mode}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(mode, str):
        # The message needs the f-prefix and must name the type that is
        # actually expected, which is 'str'.
        raise ValueError(f"Parameter mode[]='{type(mode)}' is not 'str'")
    elif mode == "":
        raise ValueError("Parameter 'mode' is empty")

    # Set the detection mode
    _set_data("detection_mode", domain, mode)
    logger.debug("EXIT!")