1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
23 from fba import database
26 from fba.helpers import blacklist
27 from fba.helpers import config
28 from fba.helpers import tidyup
30 from fba.http import federation
31 from fba.http import network
33 from fba.models import blocks
34 from fba.models import instances
# Module-level logger configuration (INFO level for the whole process).
36 logging.basicConfig(level=logging.INFO)
37 logger = logging.getLogger(__name__)
39 # Language mapping X -> English
# NOTE(review): the opening line of this mapping (presumably
# `language_mapping = {`) is missing from this listing — only one entry
# is visible. It maps localized /about page headers to their canonical
# English equivalents used as blocklist keys below.
42 "Reject": "Suspended servers",
45 def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
# Fetch the block (defederation) lists published by `domain` and persist
# them into the local database.
#   domain       - instance to crawl; must be lower-case, a valid domain,
#                  and not a *.arpa / *.tld pseudo-domain
#   origin       - instance this domain was discovered from (str or None)
#   nodeinfo_url - URL of the node's nodeinfo document
# Raises ValueError on invalid parameters.
# NOTE(review): this listing is truncated — original source lines are
# missing between the numbered statements below (e.g. the `try:` before
# line 73, the `continue` statements after warnings, several `elif`
# guards), so some branches appear without their opening statements.
46 logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
# --- Parameter validation -------------------------------------------------
47 if not isinstance(domain, str):
48 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
50 raise ValueError("Parameter 'domain' is empty")
51 elif domain.lower() != domain:
52 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
53 elif not validators.domain(domain.split("/")[0]):
54 raise ValueError(f"domain='{domain}' is not a valid domain")
55 elif domain.endswith(".arpa"):
56 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
57 elif domain.endswith(".tld"):
58 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
59 elif not isinstance(origin, str) and origin is not None:
60 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
62 raise ValueError("Parameter 'origin' is empty")
63 elif not isinstance(nodeinfo_url, str):
64 raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
65 elif nodeinfo_url == "":
66 raise ValueError("Parameter 'nodeinfo_url' is empty")
68 # @TODO Unused blockdict
# --- Fetch nodeinfo; on network failure record the error on the instance --
72 logger.debug(f"Fetching nodeinfo: domain='{domain}',nodeinfo_url='{nodeinfo_url}'")
73 rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
74 except network.exceptions as exception:
75 logger.warning(f"Exception '{type(exception)}' during fetching nodeinfo")
76 instances.set_last_error(domain, exception)
# Sanity-check the nodeinfo structure before digging into it.
79 logger.warning("Could not fetch nodeinfo from domain:", domain)
81 elif "metadata" not in rows:
82 logger.warning(f"rows()={len(rows)} does not have key 'metadata', domain='{domain}'")
84 elif "federation" not in rows["metadata"]:
85 logger.warning(f"rows()={len(rows['metadata'])} does not have key 'federation', domain='{domain}'")
88 data = rows["metadata"]["federation"]
91 logger.debug("data[]='%s'", type(data))
# --- Pleroma-style "mrf_simple" block lists (block_level -> [domains]) ----
92 if "mrf_simple" in data:
93 logger.debug("Found mrf_simple:", domain)
# NOTE(review): the dict-merge expression opening this loop is elided;
# only the "quarantined_instances" entry of it is visible here.
95 for block_level, blocklist in (
99 "quarantined_instances": data["quarantined_instances"]
103 logger.debug("block_level, blocklist():", block_level, len(blocklist))
104 block_level = tidyup.domain(block_level)
105 logger.debug("BEFORE block_level:", block_level)
107 if block_level == "":
108 logger.warning("block_level is now empty!")
110 elif block_level == "accept":
# "accept" is an allow-list, not a block list — nothing to record.
111 logger.debug("domain='%s' skipping block_level='accept'", domain)
114 logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
115 if len(blocklist) > 0:
116 for blocked in blocklist:
117 logger.debug("BEFORE blocked:", blocked)
118 blocked = tidyup.domain(blocked)
119 logger.debug("AFTER blocked:", blocked)
122 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
124 elif blacklist.is_blacklisted(blocked):
125 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
# '*'-censored domain: try to recover the real domain from instances
# seen before (instances.deobscure); skip when that fails.
127 elif blocked.count("*") > 0:
128 # Obscured domain name with no hash
129 row = instances.deobscure("*", blocked)
131 logger.debug("row[]='%s'", type(row))
133 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
136 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
# NOTE(review): row layout appears to be (domain, ?, nodeinfo_url)
# — the assignments for row[0]/row[1] are on elided lines; confirm.
139 nodeinfo_url = row[2]
# Same recovery for '?'-censored domain names.
140 elif blocked.count("?") > 0:
141 # Obscured domain name with no hash
142 row = instances.deobscure("?", blocked)
144 logger.debug("row[]='%s'", type(row))
146 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
149 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
152 nodeinfo_url = row[2]
# Register the blocked instance if unseen, then record the block.
154 logger.debug(f"blocked='{blocked}'")
155 if not utils.is_domain_wanted(blocked):
156 logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
158 elif not instances.is_registered(blocked):
160 logger.debug("Invoking commit() ...")
161 database.connection.commit()
163 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
# co_name tags the DB row with this function's name as the source.
164 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
166 if not blocks.is_instance_blocked(domain, blocked, block_level):
167 logger.debug("Blocking:", domain, blocked, block_level)
# No reason available at this level — mrf_simple has domains only.
168 blocks.add_instance(domain, blocked, None, block_level)
170 if block_level == "reject":
171 logger.debug("Adding to blockdict:", blocked)
177 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
178 blocks.update_last_seen(domain, blocked, block_level)
# --- Fallback: bare "quarantined_instances" list (no mrf_simple) ----------
179 elif "quarantined_instances" in data:
180 logger.debug(f"Found 'quarantined_instances' in JSON response: domain='{domain}'")
182 block_level = "quarantined"
184 for blocked in data["quarantined_instances"]:
185 logger.debug("BEFORE blocked:", blocked)
186 blocked = tidyup.domain(blocked)
187 logger.debug("AFTER blocked:", blocked)
190 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
192 elif blacklist.is_blacklisted(blocked):
193 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
195 elif blocked.count("*") > 0:
196 # Obscured domain name with no hash
197 row = instances.deobscure("*", blocked)
199 logger.debug("row[]='%s'", type(row))
201 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
204 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
207 nodeinfo_url = row[2]
208 elif blocked.count("?") > 0:
209 # Obscured domain name with no hash
210 row = instances.deobscure("?", blocked)
212 logger.debug("row[]='%s'", type(row))
214 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
217 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
220 nodeinfo_url = row[2]
222 logger.debug(f"blocked='{blocked}'")
223 if not utils.is_domain_wanted(blocked):
224 logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
226 elif not instances.is_registered(blocked):
228 logger.debug("Invoking commit() ...")
229 database.connection.commit()
231 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
232 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
234 if not blocks.is_instance_blocked(domain, blocked, block_level):
235 logger.debug("Blocking:", domain, blocked, block_level)
236 blocks.add_instance(domain, blocked, None, block_level)
238 if block_level == "reject":
239 logger.debug("Adding to blockdict:", blocked)
245 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
246 blocks.update_last_seen(domain, blocked, block_level)
248 logger.warning(f"Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='{domain}'")
# Persist everything recorded so far before the reasons pass.
250 logger.debug("Invoking commit() ...")
251 database.connection.commit()
# --- Second pass: block *reasons* from "mrf_simple_info" ------------------
254 if "mrf_simple_info" in data:
255 logger.debug("Found mrf_simple_info:", domain)
# Merge mrf_simple_info with quarantined_instances_info (if present).
257 for block_level, info in (
259 **data["mrf_simple_info"],
260 **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {})
263 logger.debug("block_level, info.items():", block_level, len(info.items()))
264 block_level = tidyup.domain(block_level)
265 logger.debug("BEFORE block_level:", block_level)
267 if block_level == "":
268 logger.warning("block_level is now empty!")
270 elif block_level == "accept":
271 logger.debug("domain='%s' skipping block_level='accept'", domain)
274 logger.debug(f"Checking {len(info.items())} entries from domain='{domain}',block_level='{block_level}' ...")
275 for blocked, reason in info.items():
276 logger.debug(f"blocked='{blocked}',reason[{type(reason)}]='{reason}' - BEFORE!")
277 blocked = tidyup.domain(blocked)
# `reason` may be a plain string or a dict with a "reason" key,
# depending on the remote software version; normalize either way.
279 if isinstance(reason, str):
280 logger.debug("reason[] is a string")
281 reason = tidyup.reason(reason)
282 elif isinstance(reason, dict) and "reason" in reason:
283 logger.debug("reason[] is a dict")
284 reason = tidyup.reason(reason["reason"])
285 elif reason is not None:
286 raise ValueError(f"Cannot handle reason[]='{type(reason)}'")
288 logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
291 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
293 elif blacklist.is_blacklisted(blocked):
294 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
296 elif blocked.count("*") > 0:
297 # Obscured domain name with no hash
298 row = instances.deobscure("*", blocked)
300 logger.debug("row[]='%s'", type(row))
302 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
305 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
308 nodeinfo_url = row[2]
309 elif blocked.count("?") > 0:
310 # Obscured domain name with no hash
311 row = instances.deobscure("?", blocked)
313 logger.debug("row[]='%s'", type(row))
315 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
318 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
321 nodeinfo_url = row[2]
323 logger.debug(f"blocked='{blocked}'")
324 if not utils.is_domain_wanted(blocked):
325 logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
327 elif not instances.is_registered(blocked):
328 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
329 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
331 logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
332 blocks.update_reason(reason, domain, blocked, block_level)
# Also back-fill the reason into the in-memory blockdict entries.
334 logger.debug(f"blockdict()={len(blockdict)}")
335 for entry in blockdict:
336 if entry["blocked"] == blocked:
337 logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'")
338 entry["reason"] = reason
# --- Reasons from "quarantined_instances_info" (alternate layout) ---------
340 elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
341 logger.debug(f"Found 'quarantined_instances_info' in JSON response: domain='{domain}'")
343 block_level = "quarantined"
345 #print(data["quarantined_instances_info"])
# Here `rows` maps blocked-domain -> {"reason": ...}.
346 rows = data["quarantined_instances_info"]["quarantined_instances"]
# NOTE(review): the loop header iterating `blocked` over `rows` is on
# an elided line; the statements below are that loop's body.
348 logger.debug("BEFORE blocked:", blocked)
349 blocked = tidyup.domain(blocked)
350 logger.debug("AFTER blocked:", blocked)
352 if blocked not in rows or "reason" not in rows[blocked]:
353 logger.warning(f"Cannot find blocked='{blocked}' in rows()={len(rows)},domain='{domain}'")
356 reason = rows[blocked]["reason"]
357 logger.debug(f"reason='{reason}'")
360 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
362 elif blacklist.is_blacklisted(blocked):
363 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
365 elif blocked.count("*") > 0:
366 # Obscured domain name with no hash
367 row = instances.deobscure("*", blocked)
369 logger.debug("row[]='%s'", type(row))
371 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
374 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
377 nodeinfo_url = row[2]
378 elif blocked.count("?") > 0:
379 # Obscured domain name with no hash
380 row = instances.deobscure("?", blocked)
382 logger.debug("row[]='%s'", type(row))
384 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
387 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
390 nodeinfo_url = row[2]
392 logger.debug(f"blocked='{blocked}'")
393 if not utils.is_domain_wanted(blocked):
394 logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
396 elif not instances.is_registered(blocked):
397 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
398 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
400 logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
401 blocks.update_reason(reason, domain, blocked, block_level)
403 logger.debug(f"blockdict()={len(blockdict)}")
404 for entry in blockdict:
405 if entry["blocked"] == blocked:
406 logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'")
407 entry["reason"] = reason
409 logger.warning(f"Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='{domain}'")
# --- Last resort: scrape the HTML /about page for block tables ------------
412 logger.debug(f"Did not find any useable JSON elements, domain='{domain}', continuing with /about page ...")
413 blocklist = fetch_blocks_from_about(domain)
415 logger.debug(f"blocklist()={len(blocklist)}")
416 if len(blocklist) > 0:
417 logger.info("Checking %d record(s) ...", len(blocklist))
418 for block_level in blocklist:
419 logger.debug("block_level='%s'", block_level)
421 rows = blocklist[block_level]
422 logger.debug(f"rows['{type(rows)}]()={len(rows)}'")
# NOTE(review): the loop header iterating `record` over `rows` is on
# an elided line; each record is {"blocked": ..., "reason": ...}.
424 logger.debug(f"record[]='{type(record)}'")
425 blocked = tidyup.domain(record["blocked"])
426 reason = tidyup.reason(record["reason"])
427 logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
430 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
432 elif blacklist.is_blacklisted(blocked):
433 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
435 elif blocked.count("*") > 0:
436 # Obscured domain name with no hash
437 row = instances.deobscure("*", blocked)
439 logger.debug("row[]='%s'", type(row))
441 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
444 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
447 nodeinfo_url = row[2]
448 elif blocked.count("?") > 0:
449 # Obscured domain name with no hash
450 row = instances.deobscure("?", blocked)
452 logger.debug("row[]='%s'", type(row))
454 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
457 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
460 nodeinfo_url = row[2]
462 logger.debug(f"blocked='{blocked}'")
463 if not utils.is_domain_wanted(blocked):
464 logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
466 elif not instances.is_registered(blocked):
467 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
468 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
470 if not blocks.is_instance_blocked(domain, blocked, block_level):
471 logger.debug("Blocking:", domain, blocked, block_level)
# Unlike the mrf_simple path, the /about page supplies a reason.
472 blocks.add_instance(domain, blocked, reason, block_level)
474 if block_level == "reject":
475 logger.debug("Adding to blockdict:", blocked)
481 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
482 blocks.update_reason(reason, domain, blocked, block_level)
# Final flush of all database writes made above.
484 logger.debug("Invoking commit() ...")
485 database.connection.commit()
487 logger.debug("EXIT!")
489 def fetch_blocks_from_about(domain: str) -> dict:
# Scrape a Mastodon-style /about page of `domain` and return its block
# tables as a dict keyed by block level ("reject", "media_removal",
# "followers_only"), each mapping to a list of
# {"blocked": ..., "reason": ...} records.
# Raises ValueError on invalid `domain`.
# NOTE(review): this listing is truncated — original source lines are
# missing between the numbered statements below (e.g. the `try:` before
# the fetch, the `continue`/`break` statements, the opening of the
# `blocklist = {` literal, and the closing of the returned dict).
490 logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
# --- Parameter validation (same rules as fetch_blocks) --------------------
491 if not isinstance(domain, str):
492 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
494 raise ValueError("Parameter 'domain' is empty")
495 elif domain.lower() != domain:
496 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
497 elif not validators.domain(domain.split("/")[0]):
498 raise ValueError(f"domain='{domain}' is not a valid domain")
499 elif domain.endswith(".arpa"):
500 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
501 elif domain.endswith(".tld"):
502 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
# --- Try candidate /about paths until one yields an <h2>-structured page --
504 logger.debug(f"Fetching mastodon blocks from domain='{domain}'")
506 for path in ["/instance/about/index.html"]:
511 logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
512 response = network.fetch_response(
# Connect/read timeouts come from the shared config helper.
516 (config.get("connection_timeout"), config.get("read_timeout"))
519 logger.debug(f"response.ok='{response.ok}',response.status_code='{response.status_code}',response.text()={len(response.text)}")
520 if not response.ok or response.text.strip() == "":
521 logger.warning(f"path='{path}' does not exist on domain='{domain}' - SKIPPED!")
524 logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
525 doc = bs4.BeautifulSoup(
530 logger.debug("doc[]='%s'", type(doc))
# An <h2> header signals the block-list section exists — stop probing.
531 if doc.find("h2") is not None:
532 logger.debug(f"Found 'h2' header in path='{path}' - BREAK!")
535 except network.exceptions as exception:
536 logger.warning("Cannot fetch from domain:", domain, exception)
537 instances.set_last_error(domain, exception)
# Accumulator keyed by the canonical English header names; filled from
# the page's tables below. (Opening `blocklist = {` line is elided.)
541 "Suspended servers": [],
542 "Filtered media" : [],
543 "Limited servers" : [],
544 "Silenced servers" : [],
547 logger.debug("doc[]='%s'", type(doc))
549 logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")
# --- Walk each <h2> section, translate the header, import its table -------
552 for header in doc.find_all("h2"):
553 header_text = tidyup.reason(header.text)
555 logger.debug(f"header_text='{header_text}' - BEFORE!")
# Translate localized headers to English via language_mapping.
556 if header_text in language_mapping:
557 logger.debug(f"header_text='{header_text}' - FOUND!")
558 header_text = language_mapping[header_text]
560 logger.warning(f"header_text='{header_text}' not found in language mapping table")
562 logger.debug(f"header_text='{header_text} - AFTER!'")
563 if header_text in blocklist or header_text.lower() in blocklist:
564 # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
565 logger.debug(f"Found header_text='{header_text}', importing domain blocks ...")
# Skip the table's first <tr> (column headers); each remaining row
# holds the blocked domain in column 0 and the reason in column 1.
566 for line in header.find_next("table").find_all("tr")[1:]:
567 logger.debug(f"line[]='{type(line)}'")
568 blocklist[header_text].append({
569 "blocked": tidyup.domain(line.find_all("td")[0].text),
570 "reason" : tidyup.reason(line.find_all("td")[1].text),
573 logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")
# --- Re-key the collected lists to the caller-facing block levels ---------
575 logger.debug(f"Returning blocklist for domain='{domain}'")
577 "reject" : blocklist["Suspended servers"],
578 "media_removal" : blocklist["Filtered media"],
579 "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],