1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
23 from fba import federation
24 from fba import network
26 from fba.helpers import blacklist
27 from fba.helpers import config
28 from fba.helpers import tidyup
30 from fba.models import blocks
31 from fba.models import instances
35 "Reject": "Suspended servers",
38 def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
# Fetches the nodeinfo document of `domain` and processes the block data found
# under rows["metadata"]["federation"], recording it through the `blocks` and
# `instances` models. When no usable data is found, the /about page is parsed
# via fetch_blocks_from_about() instead.
#
# Parameters (as validated by the checks below):
#   domain       - must be a str (an "is empty" ValueError also exists; its
#                  condition is not visible in this listing)
#   origin       - must be a str or None (same remark for its "is empty" error)
#   nodeinfo_url - must be a non-empty str
#
# Raises:
#   ValueError - when a parameter fails the checks below, or when a block
#                reason is not a str, not a dict containing "reason", and
#                not None.
#
# NOTE(review): this listing does not contain every original line (the embedded
# line numbers skip), so some conditions, `try:`/`continue`/`return` statements
# and assignments are not visible. The comments added here describe only what
# the visible lines do; anything else is marked as an assumption.
39 # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
# --- Parameter validation ---
40 if not isinstance(domain, str):
41 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
43 raise ValueError("Parameter 'domain' is empty")
44 elif not isinstance(origin, str) and origin is not None:
45 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
47 raise ValueError("Parameter 'origin' is empty")
48 elif not isinstance(nodeinfo_url, str):
49 raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
50 elif nodeinfo_url == "":
51 raise ValueError("Parameter 'nodeinfo_url' is empty")
53 # @TODO Unused blockdict
# --- Fetch nodeinfo ---
# Network errors are reported and stored with instances.set_last_error().
57 # DEBUG: print(f"DEBUG: Fetching nodeinfo: domain='{domain}',nodeinfo_url='{nodeinfo_url}'")
58 rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
59 except network.exceptions as exception:
60 print(f"WARNING: Exception '{type(exception)}' during fetching nodeinfo")
61 instances.set_last_error(domain, exception)
# The reply must contain rows["metadata"]["federation"]; each missing level
# only produces a warning here.
64 print("WARNING: Could not fetch nodeinfo from domain:", domain)
66 elif "metadata" not in rows:
67 print(f"WARNING: rows()={len(rows)} does not have key 'metadata', domain='{domain}'")
69 elif "federation" not in rows["metadata"]:
70 print(f"WARNING: rows()={len(rows['metadata'])} does not have key 'federation', domain='{domain}'")
73 data = rows["metadata"]["federation"]
# --- Pass 1: block lists ---
# Either "mrf_simple" or, failing that, a plain "quarantined_instances" list
# is processed.
76 # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
77 if "mrf_simple" in data:
78 # DEBUG: print("DEBUG: Found mrf_simple:", domain)
# Iterates (block_level, blocklist) pairs; the iterated mapping includes
# data["quarantined_instances"] under the key "quarantined_instances"
# (the rest of the expression is not visible here).
80 for block_level, blocklist in (
84 "quarantined_instances": data["quarantined_instances"]
88 # DEBUG: print("DEBUG: block_level, blocklist():", block_level, len(blocklist))
# The block level name is normalised with the same tidyup.domain() helper
# that is applied to domain names.
89 block_level = tidyup.domain(block_level)
90 # DEBUG: print("DEBUG: BEFORE block_level:", block_level)
93 print("WARNING: block_level is now empty!")
# "accept" is singled out; per the DEBUG text it is skipped (the skipping
# statement itself is not visible here).
95 elif block_level == "accept":
96 # DEBUG: print(f"DEBUG: domain='{domain}' skipping block_level='accept'")
99 # DEBUG: print(f"DEBUG: Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
100 if len(blocklist) > 0:
101 for blocked in blocklist:
102 # DEBUG: print("DEBUG: BEFORE blocked:", blocked)
103 blocked = tidyup.domain(blocked)
104 # DEBUG: print("DEBUG: AFTER blocked:", blocked)
107 print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
109 elif blacklist.is_blacklisted(blocked):
110 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
# Entries containing "*" or "?" are obscured names; instances.deobscure()
# is asked for a matching row.
# NOTE(review): `nodeinfo_url = row[2]` rebinds the function parameter, so the
# instances.add() calls further down use the looked-up value - confirm intended.
112 elif blocked.count("*") > 0:
113 # Obscured domain name with no hash
114 row = instances.deobscure("*", blocked)
116 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
118 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
121 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
124 nodeinfo_url = row[2]
125 elif blocked.count("?") > 0:
126 # Obscured domain name with no hash
127 row = instances.deobscure("?", blocked)
129 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
131 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
134 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
137 nodeinfo_url = row[2]
# Second chain: validity checks on the (possibly de-obscured) name, then
# registration of instances that are not yet known.
139 # DEBUG: print(f"DEBUG: blocked='{blocked}'")
140 if not validators.domain(blocked):
141 print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
143 elif blocked.endswith(".arpa"):
144 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
146 elif blocked.endswith(".tld"):
147 print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
149 elif blacklist.is_blacklisted(blocked):
150 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
152 elif not instances.is_registered(blocked):
# Pending changes are committed before the new instance is added.
154 fba.connection.commit()
156 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
# The third argument is the name of this function, obtained at runtime.
157 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
# A block that is not recorded yet is added without a reason (None);
# blocks.update_last_seen() handles the last-seen update.
159 if not blocks.is_instance_blocked(domain, blocked, block_level):
160 # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
161 blocks.add_instance(domain, blocked, None, block_level)
163 if block_level == "reject":
164 # DEBUG: print("DEBUG: Adding to blockdict:", blocked)
170 # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
171 blocks.update_last_seen(domain, blocked, block_level)
# Alternative source: a plain list under "quarantined_instances"; every entry
# is recorded with the fixed block level "quarantined".
172 elif "quarantined_instances" in data:
173 # DEBUG: print(f"DEBUG: Found 'quarantined_instances' in JSON response: domain='{domain}'")
175 block_level = "quarantined"
177 for blocked in data["quarantined_instances"]:
178 # DEBUG: print("DEBUG: BEFORE blocked:", blocked)
179 blocked = tidyup.domain(blocked)
180 # DEBUG: print("DEBUG: AFTER blocked:", blocked)
183 print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
185 elif blacklist.is_blacklisted(blocked):
186 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
188 elif blocked.count("*") > 0:
189 # Obscured domain name with no hash
190 row = instances.deobscure("*", blocked)
192 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
194 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
197 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
200 nodeinfo_url = row[2]
201 elif blocked.count("?") > 0:
202 # Obscured domain name with no hash
203 row = instances.deobscure("?", blocked)
205 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
207 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
210 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
213 nodeinfo_url = row[2]
215 # DEBUG: print(f"DEBUG: blocked='{blocked}'")
216 if not validators.domain(blocked):
217 print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
219 elif blocked.endswith(".arpa"):
220 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
222 elif blocked.endswith(".tld"):
223 print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
225 elif blacklist.is_blacklisted(blocked):
226 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
228 elif not instances.is_registered(blocked):
230 fba.connection.commit()
232 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
233 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
235 if not blocks.is_instance_blocked(domain, blocked, block_level):
236 # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
237 blocks.add_instance(domain, blocked, None, block_level)
# NOTE(review): block_level was set to "quarantined" above, so unless a line
# not shown here changes it, this condition cannot be true - confirm.
239 if block_level == "reject":
240 # DEBUG: print("DEBUG: Adding to blockdict:", blocked)
246 # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
247 blocks.update_last_seen(domain, blocked, block_level)
249 print(f"WARNING: Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='{domain}'")
251 # DEBUG: print("DEBUG: Committing changes ...")
252 fba.connection.commit()
# --- Pass 2: block reasons ---
# "mrf_simple_info" is merged with "quarantined_instances_info" (when present)
# and iterated as block_level -> {blocked: reason}.
255 if "mrf_simple_info" in data:
256 # DEBUG: print("DEBUG: Found mrf_simple_info:", domain)
258 for block_level, info in (
260 **data["mrf_simple_info"],
261 **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {})
264 # DEBUG: print("DEBUG: block_level, info.items():", block_level, len(info.items()))
265 block_level = tidyup.domain(block_level)
266 # DEBUG: print("DEBUG: BEFORE block_level:", block_level)
268 if block_level == "":
269 print("WARNING: block_level is now empty!")
271 elif block_level == "accept":
272 # DEBUG: print(f"DEBUG: domain='{domain}' skipping block_level='accept'")
275 # DEBUG: print(f"DEBUG: Checking {len(info.items())} entries from domain='{domain}',software='pleroma',block_level='{block_level}' ...")
276 for blocked, reason in info.items():
277 # DEBUG: print(f"DEBUG: blocked='{blocked}',reason[{type(reason)}]='{reason}' - BEFORE!")
278 blocked = tidyup.domain(blocked)
# A reason may be a plain string or a dict carrying a "reason" key; both are
# cleaned with tidyup.reason(). None is left as-is, anything else is an error.
280 if isinstance(reason, str):
281 # DEBUG: print("DEBUG: reason[] is a string")
282 reason = tidyup.reason(reason)
283 elif isinstance(reason, dict) and "reason" in reason:
284 # DEBUG: print("DEBUG: reason[] is a dict")
285 reason = tidyup.reason(reason["reason"])
286 elif reason is not None:
287 raise ValueError(f"Cannot handle reason[]='{type(reason)}'")
289 # DEBUG: print(f"DEBUG: blocked='{blocked}',reason='{reason}' - AFTER!")
292 print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
294 elif blacklist.is_blacklisted(blocked):
295 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
297 elif blocked.count("*") > 0:
298 # Obscured domain name with no hash
299 row = instances.deobscure("*", blocked)
301 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
303 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
306 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
309 nodeinfo_url = row[2]
310 elif blocked.count("?") > 0:
311 # Obscured domain name with no hash
312 row = instances.deobscure("?", blocked)
314 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
316 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
319 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
322 nodeinfo_url = row[2]
324 # DEBUG: print(f"DEBUG: blocked='{blocked}'")
325 if not validators.domain(blocked):
326 print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
328 elif blocked.endswith(".arpa"):
329 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
331 elif blocked.endswith(".tld"):
332 print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
334 elif blacklist.is_blacklisted(blocked):
335 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
337 elif not instances.is_registered(blocked):
338 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
339 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
# The reason is stored for the block, and any `blockdict` entry with the same
# "blocked" value gets its "reason" updated in place.
341 # DEBUG: print(f"DEBUG: Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
342 blocks.update_reason(reason, domain, blocked, block_level)
344 # DEBUG: print(f"DEBUG: blockdict()={len(blockdict)}")
345 for entry in blockdict:
346 if entry["blocked"] == blocked:
347 # DEBUG: print(f"DEBUG: Updating entry reason: blocked='{blocked}',reason='{reason}'")
348 entry["reason"] = reason
# Alternative source: reasons nested under
# data["quarantined_instances_info"]["quarantined_instances"], keyed by the
# blocked name; the block level is fixed to "quarantined".
350 elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
351 # DEBUG: print(f"DEBUG: Found 'quarantined_instances_info' in JSON response: domain='{domain}'")
353 block_level = "quarantined"
355 #print(data["quarantined_instances_info"])
# `rows` (previously the nodeinfo reply) is rebound to the reasons mapping.
356 rows = data["quarantined_instances_info"]["quarantined_instances"]
358 # DEBUG: print("DEBUG: BEFORE blocked:", blocked)
359 blocked = tidyup.domain(blocked)
360 # DEBUG: print("DEBUG: AFTER blocked:", blocked)
# The lookup uses the tidied name, so an entry is only found when the tidied
# name is itself a key of `rows`.
362 if blocked not in rows or "reason" not in rows[blocked]:
363 print(f"WARNING: Cannot find blocked='{blocked}' in rows()={len(rows)},domain='{domain}'")
366 reason = rows[blocked]["reason"]
367 # DEBUG: print(f"DEBUG: reason='{reason}'")
370 print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
372 elif blacklist.is_blacklisted(blocked):
373 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
375 elif blocked.count("*") > 0:
376 # Obscured domain name with no hash
377 row = instances.deobscure("*", blocked)
379 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
381 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
384 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
387 nodeinfo_url = row[2]
388 elif blocked.count("?") > 0:
389 # Obscured domain name with no hash
390 row = instances.deobscure("?", blocked)
392 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
394 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
397 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
400 nodeinfo_url = row[2]
402 # DEBUG: print(f"DEBUG: blocked='{blocked}'")
403 if not validators.domain(blocked):
404 print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
406 elif blocked.endswith(".arpa"):
407 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
409 elif blocked.endswith(".tld"):
410 print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
412 elif blacklist.is_blacklisted(blocked):
413 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
415 elif not instances.is_registered(blocked):
416 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
417 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
419 # DEBUG: print(f"DEBUG: Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
420 blocks.update_reason(reason, domain, blocked, block_level)
422 # DEBUG: print(f"DEBUG: blockdict()={len(blockdict)}")
423 for entry in blockdict:
424 if entry["blocked"] == blocked:
425 # DEBUG: print(f"DEBUG: Updating entry reason: blocked='{blocked}',reason='{reason}'")
426 entry["reason"] = reason
428 print(f"WARNING: Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='{domain}'")
# --- Pass 3: fallback to the /about page ---
# Per the DEBUG text this runs when no usable JSON elements were found (the
# guarding condition is not visible here). fetch_blocks_from_about() yields a
# mapping of block_level -> list of records with "blocked" and "reason" keys.
431 # DEBUG: print(f"DEBUG: Did not find any useable JSON elements, domain='{domain}', continuing with /about page ...")
432 blocklist = fetch_blocks_from_about(domain)
434 # DEBUG: print(f"DEBUG: blocklist()={len(blocklist)}")
435 if len(blocklist) > 0:
436 print(f"INFO: Checking {len(blocklist)} record(s) ...")
437 for block_level in blocklist:
438 # DEBUG: print(f"DEBUG: block_level='{block_level}'")
439 rows = blocklist[block_level]
440 # DEBUG: print(f"DEBUG: rows['{type(rows)}]()={len(rows)}'")
442 # DEBUG: print(f"DEBUG: record[]='{type(record)}'")
443 blocked = tidyup.domain(record["blocked"])
444 reason = tidyup.reason(record["reason"])
445 # DEBUG: print(f"DEBUG: blocked='{blocked}',reason='{reason}' - AFTER!")
448 print("WARNING: blocked is empty after tidyup.domain():", domain, block_level)
450 elif blacklist.is_blacklisted(blocked):
451 # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
453 elif blocked.count("*") > 0:
454 # Obscured domain name with no hash
455 row = instances.deobscure("*", blocked)
457 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
459 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
462 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
465 nodeinfo_url = row[2]
466 elif blocked.count("?") > 0:
467 # Obscured domain name with no hash
468 row = instances.deobscure("?", blocked)
470 # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
472 print(f"WARNING: Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
475 # DEBUG: print(f"DEBUG: blocked='{blocked}' de-obscured to '{row[0]}'")
478 nodeinfo_url = row[2]
480 # DEBUG: print(f"DEBUG: blocked='{blocked}'")
481 if not validators.domain(blocked):
482 print(f"WARNING: blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
484 elif blocked.endswith(".arpa"):
485 print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
487 elif blocked.endswith(".tld"):
488 print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
490 elif not instances.is_registered(blocked):
491 # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
492 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
# Unlike pass 1, the reason is known here and is passed to add_instance().
494 if not blocks.is_instance_blocked(domain, blocked, block_level):
495 # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
496 blocks.add_instance(domain, blocked, reason, block_level)
498 if block_level == "reject":
499 # DEBUG: print("DEBUG: Adding to blockdict:", blocked)
# NOTE(review): the DEBUG text says "last seen" but the call below is
# blocks.update_reason(); pass 1 calls blocks.update_last_seen() at the
# corresponding point - confirm which one is intended.
505 # DEBUG: print(f"DEBUG: Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
506 blocks.update_reason(reason, domain, blocked, block_level)
# Final commit of everything recorded above.
508 fba.connection.commit()
509 # DEBUG: print("DEBUG: EXIT!")
511 def fetch_blocks_from_about(domain: str) -> dict:
512 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
513 if not isinstance(domain, str):
514 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
516 raise ValueError("Parameter 'domain' is empty")
518 # DEBUG: print(f"DEBUG: Fetching mastodon blocks from domain='{domain}'")
520 for path in ["/instance/about/index.html"]:
525 # DEBUG: print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
526 response = network.fetch_response(
530 (config.get("connection_timeout"), config.get("read_timeout"))
533 # DEBUG: print(f"DEBUG: response.ok='{response.ok}',response.status_code='{response.status_code}',response.text()={len(response.text)}")
534 if not response.ok or response.text.strip() == "":
535 print(f"WARNING: path='{path}' does not exist on domain='{domain}' - SKIPPED!")
538 # DEBUG: print(f"DEBUG: Parsing response.text()={len(response.text)} Bytes ...")
539 doc = bs4.BeautifulSoup(
544 # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
545 if doc.find("h2") is not None:
546 # DEBUG: print(f"DEBUG: Found 'h2' header in path='{path}' - BREAK!")
549 except network.exceptions as exception:
550 print("ERROR: Cannot fetch from domain:", domain, exception)
551 instances.set_last_error(domain, exception)
555 "Suspended servers": [],
556 "Filtered media" : [],
557 "Limited servers" : [],
558 "Silenced servers" : [],
561 # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
563 print(f"WARNING: Cannot fetch any /about pages for domain='{domain}' - EXIT!")
566 for header in doc.find_all("h2"):
567 header_text = tidyup.reason(header.text)
569 # DEBUG: print(f"DEBUG: header_text='{header_text}' - BEFORE!")
570 if header_text in language_mapping:
571 # DEBUG: print(f"DEBUG: header_text='{header_text}' - FOUND!")
572 header_text = language_mapping[header_text]
574 print(f"WARNING: header_text='{header_text}' not found in language mapping table")
576 # DEBUG: print(f"DEBUG: header_text='{header_text} - AFTER!'")
577 if header_text in blocklist or header_text.lower() in blocklist:
578 # find_next() is used here (instead of find_next_siblings()) to account for instances that e.g. hide their lists in a dropdown menu
579 # DEBUG: print(f"DEBUG: Found header_text='{header_text}', importing domain blocks ...")
580 for line in header.find_next("table").find_all("tr")[1:]:
581 # DEBUG: print(f"DEBUG: line[]='{type(line)}'")
582 blocklist[header_text].append({
583 "blocked": tidyup.domain(line.find_all("td")[0].text),
584 "reason" : tidyup.reason(line.find_all("td")[1].text),
587 print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
589 # DEBUG: print(f"DEBUG: Returning blocklist for domain='{domain}'")
591 "reject" : blocklist["Suspended servers"],
592 "media_removal" : blocklist["Filtered media"],
593 "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],