1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import tidyup
29 from fba.http import federation
30 from fba.http import network
32 from fba.models import blocks
33 from fba.models import instances
35 logging.basicConfig(level=logging.INFO)
36 logger = logging.getLogger(__name__)
38 # Language mapping X -> English
41 "Reject": "Suspended servers",
# NOTE(review): This block is a CORRUPTED extraction of the original function.
# Every line carries a stray leading number (the original file's line numbers),
# all indentation has been stripped, and gaps in that numbering show that lines
# are MISSING — e.g. the `try:` matching the orphan `except` below, the
# `if domain == "":` guards before several bare `raise` lines, `continue`
# statements, and the dict-literal openers for the mrf_simple mapping. The code
# is NOT valid Python as-is; restore it from the upstream source before use.
# Only comments are added here — the code text is left byte-identical.
#
# Purpose (inferred from the visible calls — confirm against upstream):
# fetch the federation block lists ("mrf_simple", "quarantined_instances" and
# their *_info companions) from a Pleroma/Akkoma instance's nodeinfo metadata,
# falling back to scraping the instance's /about page, and record the blocks
# plus their reasons in the local database.
#
# Parameters (as visible in the signature and validation below):
#   domain       - lower-cased domain name of the instance to crawl
#   origin       - originating domain (str) or None
#   nodeinfo_url - URL of the instance's nodeinfo document
# Raises ValueError on malformed parameters; returns nothing (side effects
# only: writes via instances.*, blocks.*, fba.connection.commit()).
44 def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
45 logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
# --- Parameter validation. NOTE(review): the `elif domain == "":` line that
# should precede the next bare `raise` (original line 48) is missing here.
46 if not isinstance(domain, str):
47 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
49 raise ValueError("Parameter 'domain' is empty")
50 elif domain.lower() != domain:
51 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
52 elif not validators.domain(domain.split("/")[0]):
53 raise ValueError(f"domain='{domain}' is not a valid domain")
54 elif domain.endswith(".arpa"):
55 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
56 elif domain.endswith(".tld"):
57 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
58 elif not isinstance(origin, str) and origin is not None:
59 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
# NOTE(review): the `elif origin == "":` guard (original line 60) is missing.
61 raise ValueError("Parameter 'origin' is empty")
62 elif not isinstance(nodeinfo_url, str):
63 raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
64 elif nodeinfo_url == "":
65 raise ValueError("Parameter 'nodeinfo_url' is empty")
67 # @TODO Unused blockdict
# --- Fetch nodeinfo. NOTE(review): the `try:` opening this handler (around
# original line 70) is missing from this extraction.
71 logger.debug(f"Fetching nodeinfo: domain='{domain}',nodeinfo_url='{nodeinfo_url}'")
72 rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
73 except network.exceptions as exception:
74 logger.warning(f"Exception '{type(exception)}' during fetching nodeinfo")
75 instances.set_last_error(domain, exception)
# --- Sanity-check the nodeinfo payload. NOTE(review): the `if rows is None:`
# line and the `return` statements after each warning are missing.
78 logger.warning("Could not fetch nodeinfo from domain:", domain)
80 elif "metadata" not in rows:
81 logger.warning(f"rows()={len(rows)} does not have key 'metadata', domain='{domain}'")
83 elif "federation" not in rows["metadata"]:
84 logger.warning(f"rows()={len(rows['metadata'])} does not have key 'federation', domain='{domain}'")
87 data = rows["metadata"]["federation"]
90 logger.debug("data[]='%s'", type(data))
# --- Branch 1: "mrf_simple" block lists keyed by block level.
91 if "mrf_simple" in data:
92 logger.debug("Found mrf_simple:", domain)
# NOTE(review): the dict literal between the next two lines (merging
# data["mrf_simple"] with the "quarantined_instances" entry) is mostly missing.
94 for block_level, blocklist in (
98 "quarantined_instances": data["quarantined_instances"]
102 logger.debug("block_level, blocklist():", block_level, len(blocklist))
103 block_level = tidyup.domain(block_level)
104 logger.debug("BEFORE block_level:", block_level)
# NOTE(review): `continue` statements after these two guards are missing.
106 if block_level == "":
107 logger.warning("block_level is now empty!")
109 elif block_level == "accept":
110 logger.debug(f"domain='{domain}' skipping block_level='accept'")
113 logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
114 if len(blocklist) > 0:
115 for blocked in blocklist:
116 logger.debug("BEFORE blocked:", blocked)
117 blocked = tidyup.domain(blocked)
118 logger.debug("AFTER blocked:", blocked)
# Per-entry filtering: empty after tidy-up, blacklisted, or obscured
# ("*" / "?" wildcards) names that must be de-obscured first.
121 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
123 elif blacklist.is_blacklisted(blocked):
124 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
126 elif blocked.count("*") > 0:
127 # Obscured domain name with no hash
128 row = instances.deobscure("*", blocked)
130 logger.debug("row[]='%s'", type(row))
# NOTE(review): the `if row is None:` before this warning and the
# `blocked = row[0]` / `origin = row[1]` assignments are missing.
132 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
135 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
138 nodeinfo_url = row[2]
139 elif blocked.count("?") > 0:
140 # Obscured domain name with no hash
141 row = instances.deobscure("?", blocked)
143 logger.debug("row[]='%s'", type(row))
145 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
148 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
151 nodeinfo_url = row[2]
# Validate the (possibly de-obscured) domain, then register and record it.
153 logger.debug(f"blocked='{blocked}'")
154 if not validators.domain(blocked):
155 logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
157 elif blocked.endswith(".arpa"):
158 logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
160 elif blocked.endswith(".tld"):
161 logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
163 elif blacklist.is_blacklisted(blocked):
164 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
166 elif not instances.is_registered(blocked):
# NOTE(review): lines between the next two statements (original 167-169)
# are missing; commit-before-add ordering cannot be confirmed from here.
168 fba.connection.commit()
170 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
# inspect.currentframe().f_code.co_name records this function's name
# as the "command" that discovered the instance.
171 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
173 if not blocks.is_instance_blocked(domain, blocked, block_level):
174 logger.debug("Blocking:", domain, blocked, block_level)
175 blocks.add_instance(domain, blocked, None, block_level)
177 if block_level == "reject":
# NOTE(review): the blockdict.append({...}) payload (original 179-183)
# is missing from this extraction.
178 logger.debug("Adding to blockdict:", blocked)
184 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
185 blocks.update_last_seen(domain, blocked, block_level)
# --- Branch 2: flat "quarantined_instances" list (no per-level mapping);
# largely the same per-entry pipeline as Branch 1 with block_level fixed.
186 elif "quarantined_instances" in data:
187 logger.debug(f"Found 'quarantined_instances' in JSON response: domain='{domain}'")
189 block_level = "quarantined"
191 for blocked in data["quarantined_instances"]:
192 logger.debug("BEFORE blocked:", blocked)
193 blocked = tidyup.domain(blocked)
194 logger.debug("AFTER blocked:", blocked)
197 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
199 elif blacklist.is_blacklisted(blocked):
200 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
202 elif blocked.count("*") > 0:
203 # Obscured domain name with no hash
204 row = instances.deobscure("*", blocked)
206 logger.debug("row[]='%s'", type(row))
208 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
211 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
214 nodeinfo_url = row[2]
215 elif blocked.count("?") > 0:
216 # Obscured domain name with no hash
217 row = instances.deobscure("?", blocked)
219 logger.debug("row[]='%s'", type(row))
221 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
224 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
227 nodeinfo_url = row[2]
229 logger.debug(f"blocked='{blocked}'")
230 if not validators.domain(blocked):
231 logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
233 elif blocked.endswith(".arpa"):
234 logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
236 elif blocked.endswith(".tld"):
237 logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
239 elif blacklist.is_blacklisted(blocked):
240 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
242 elif not instances.is_registered(blocked):
244 fba.connection.commit()
246 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
247 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
249 if not blocks.is_instance_blocked(domain, blocked, block_level):
250 logger.debug("Blocking:", domain, blocked, block_level)
251 blocks.add_instance(domain, blocked, None, block_level)
253 if block_level == "reject":
254 logger.debug("Adding to blockdict:", blocked)
260 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
261 blocks.update_last_seen(domain, blocked, block_level)
# NOTE(review): this warning belongs to a missing `else:` of the
# mrf_simple / quarantined_instances branching above.
263 logger.warning(f"Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='{domain}'")
265 logger.debug("Committing changes ...")
266 fba.connection.commit()
# --- Branch 3: "*_info" dicts carrying human-readable block REASONS,
# used to update previously recorded blocks.
269 if "mrf_simple_info" in data:
270 logger.debug("Found mrf_simple_info:", domain)
# NOTE(review): the dict-literal opener/closer around this merge
# (original lines 273/276-277) is missing.
272 for block_level, info in (
274 **data["mrf_simple_info"],
275 **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {})
278 logger.debug("block_level, info.items():", block_level, len(info.items()))
279 block_level = tidyup.domain(block_level)
280 logger.debug("BEFORE block_level:", block_level)
282 if block_level == "":
283 logger.warning("block_level is now empty!")
285 elif block_level == "accept":
286 logger.debug(f"domain='{domain}' skipping block_level='accept'")
289 logger.debug(f"Checking {len(info.items())} entries from domain='{domain}',software='pleroma',block_level='{block_level}' ...")
290 for blocked, reason in info.items():
291 logger.debug(f"blocked='{blocked}',reason[{type(reason)}]='{reason}' - BEFORE!")
292 blocked = tidyup.domain(blocked)
# Reasons may come as a plain string or as {"reason": ...}; anything
# else (except None) is rejected.
294 if isinstance(reason, str):
295 logger.debug("reason[] is a string")
296 reason = tidyup.reason(reason)
297 elif isinstance(reason, dict) and "reason" in reason:
298 logger.debug("reason[] is a dict")
299 reason = tidyup.reason(reason["reason"])
300 elif reason is not None:
301 raise ValueError(f"Cannot handle reason[]='{type(reason)}'")
303 logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
306 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
308 elif blacklist.is_blacklisted(blocked):
309 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
311 elif blocked.count("*") > 0:
312 # Obscured domain name with no hash
313 row = instances.deobscure("*", blocked)
315 logger.debug("row[]='%s'", type(row))
317 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
320 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
323 nodeinfo_url = row[2]
324 elif blocked.count("?") > 0:
325 # Obscured domain name with no hash
326 row = instances.deobscure("?", blocked)
328 logger.debug("row[]='%s'", type(row))
330 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
333 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
336 nodeinfo_url = row[2]
338 logger.debug(f"blocked='{blocked}'")
339 if not validators.domain(blocked):
340 logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
342 elif blocked.endswith(".arpa"):
343 logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
345 elif blocked.endswith(".tld"):
346 logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
348 elif blacklist.is_blacklisted(blocked):
349 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
351 elif not instances.is_registered(blocked):
352 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
353 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
355 logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
356 blocks.update_reason(reason, domain, blocked, block_level)
# Propagate the reason into the in-memory blockdict entries as well.
358 logger.debug(f"blockdict()={len(blockdict)}")
359 for entry in blockdict:
360 if entry["blocked"] == blocked:
361 logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'")
362 entry["reason"] = reason
# --- Branch 4: reasons nested under quarantined_instances_info.
364 elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
365 logger.debug(f"Found 'quarantined_instances_info' in JSON response: domain='{domain}'")
367 block_level = "quarantined"
369 #print(data["quarantined_instances_info"])
370 rows = data["quarantined_instances_info"]["quarantined_instances"]
# NOTE(review): the `for blocked in rows:` loop header (original line
# 371) is missing before the next line.
372 logger.debug("BEFORE blocked:", blocked)
373 blocked = tidyup.domain(blocked)
374 logger.debug("AFTER blocked:", blocked)
376 if blocked not in rows or "reason" not in rows[blocked]:
377 logger.warning(f"Cannot find blocked='{blocked}' in rows()={len(rows)},domain='{domain}'")
380 reason = rows[blocked]["reason"]
381 logger.debug(f"reason='{reason}'")
384 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
386 elif blacklist.is_blacklisted(blocked):
387 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
389 elif blocked.count("*") > 0:
390 # Obscured domain name with no hash
391 row = instances.deobscure("*", blocked)
393 logger.debug("row[]='%s'", type(row))
395 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
398 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
401 nodeinfo_url = row[2]
402 elif blocked.count("?") > 0:
403 # Obscured domain name with no hash
404 row = instances.deobscure("?", blocked)
406 logger.debug("row[]='%s'", type(row))
408 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
411 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
414 nodeinfo_url = row[2]
416 logger.debug(f"blocked='{blocked}'")
417 if not validators.domain(blocked):
418 logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
420 elif blocked.endswith(".arpa"):
421 logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
423 elif blocked.endswith(".tld"):
424 logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
426 elif blacklist.is_blacklisted(blocked):
427 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
429 elif not instances.is_registered(blocked):
430 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
431 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
433 logger.debug(f"Updating block reason: reason='{reason}',domain='{domain}',blocked='{blocked}',block_level='{block_level}'")
434 blocks.update_reason(reason, domain, blocked, block_level)
436 logger.debug(f"blockdict()={len(blockdict)}")
437 for entry in blockdict:
438 if entry["blocked"] == blocked:
439 logger.debug(f"Updating entry reason: blocked='{blocked}',reason='{reason}'")
440 entry["reason"] = reason
# NOTE(review): belongs to a missing `else:` of the *_info branching above.
442 logger.warning(f"Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='{domain}'")
# --- Fallback: scrape the /about page when nodeinfo carried nothing useful.
445 logger.debug(f"Did not find any useable JSON elements, domain='{domain}', continuing with /about page ...")
446 blocklist = fetch_blocks_from_about(domain)
448 logger.debug(f"blocklist()={len(blocklist)}")
449 if len(blocklist) > 0:
450 logger.info("Checking %d record(s) ...", len(blocklist))
451 for block_level in blocklist:
452 logger.debug("block_level='%s'", block_level)
454 rows = blocklist[block_level]
455 logger.debug(f"rows['{type(rows)}]()={len(rows)}'")
# NOTE(review): the `for record in rows:` loop header (original line
# 456) is missing before the next line.
457 logger.debug(f"record[]='{type(record)}'")
458 blocked = tidyup.domain(record["blocked"])
459 reason = tidyup.reason(record["reason"])
460 logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
463 logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
465 elif blacklist.is_blacklisted(blocked):
466 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
468 elif blocked.count("*") > 0:
469 # Obscured domain name with no hash
470 row = instances.deobscure("*", blocked)
472 logger.debug("row[]='%s'", type(row))
474 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
477 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
480 nodeinfo_url = row[2]
481 elif blocked.count("?") > 0:
482 # Obscured domain name with no hash
483 row = instances.deobscure("?", blocked)
485 logger.debug("row[]='%s'", type(row))
487 logger.warning(f"Cannot deobsfucate blocked='{blocked}',domain='{domain}',origin='{origin}' - SKIPPED!")
490 logger.debug(f"blocked='{blocked}' de-obscured to '{row[0]}'")
493 nodeinfo_url = row[2]
495 logger.debug(f"blocked='{blocked}'")
496 if not validators.domain(blocked):
497 logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
499 elif blocked.endswith(".arpa"):
500 logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
502 elif blocked.endswith(".tld"):
503 logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
505 elif not instances.is_registered(blocked):
506 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
507 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
# /about records DO carry a reason, so it is stored with the block here
# (unlike the nodeinfo branches above, which pass None first).
509 if not blocks.is_instance_blocked(domain, blocked, block_level):
510 logger.debug("Blocking:", domain, blocked, block_level)
511 blocks.add_instance(domain, blocked, reason, block_level)
513 if block_level == "reject":
514 logger.debug("Adding to blockdict:", blocked)
520 logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
521 blocks.update_reason(reason, domain, blocked, block_level)
# Final commit of everything recorded above.
523 fba.connection.commit()
524 logger.debug("EXIT!")
526 def fetch_blocks_from_about(domain: str) -> dict:
527 logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
528 if not isinstance(domain, str):
529 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
531 raise ValueError("Parameter 'domain' is empty")
532 elif domain.lower() != domain:
533 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
534 elif not validators.domain(domain.split("/")[0]):
535 raise ValueError(f"domain='{domain}' is not a valid domain")
536 elif domain.endswith(".arpa"):
537 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
538 elif domain.endswith(".tld"):
539 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
541 logger.debug(f"Fetching mastodon blocks from domain='{domain}'")
543 for path in ["/instance/about/index.html"]:
548 logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
549 response = network.fetch_response(
553 (config.get("connection_timeout"), config.get("read_timeout"))
556 logger.debug(f"response.ok='{response.ok}',response.status_code='{response.status_code}',response.text()={len(response.text)}")
557 if not response.ok or response.text.strip() == "":
558 logger.warning(f"path='{path}' does not exist on domain='{domain}' - SKIPPED!")
561 logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
562 doc = bs4.BeautifulSoup(
567 logger.debug("doc[]='%s'", type(doc))
568 if doc.find("h2") is not None:
569 logger.debug(f"Found 'h2' header in path='{path}' - BREAK!")
572 except network.exceptions as exception:
573 logger.warning("Cannot fetch from domain:", domain, exception)
574 instances.set_last_error(domain, exception)
578 "Suspended servers": [],
579 "Filtered media" : [],
580 "Limited servers" : [],
581 "Silenced servers" : [],
584 logger.debug("doc[]='%s'", type(doc))
586 logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")
589 for header in doc.find_all("h2"):
590 header_text = tidyup.reason(header.text)
592 logger.debug(f"header_text='{header_text}' - BEFORE!")
593 if header_text in language_mapping:
594 logger.debug(f"header_text='{header_text}' - FOUND!")
595 header_text = language_mapping[header_text]
597 logger.warning(f"header_text='{header_text}' not found in language mapping table")
599 logger.debug(f"header_text='{header_text} - AFTER!'")
600 if header_text in blocklist or header_text.lower() in blocklist:
601 # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
602 logger.debug(f"Found header_text='{header_text}', importing domain blocks ...")
603 for line in header.find_next("table").find_all("tr")[1:]:
604 logger.debug(f"line[]='{type(line)}'")
605 blocklist[header_text].append({
606 "blocked": tidyup.domain(line.find_all("td")[0].text),
607 "reason" : tidyup.reason(line.find_all("td")[1].text),
610 logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")
612 logger.debug(f"Returning blocklist for domain='{domain}'")
614 "reject" : blocklist["Suspended servers"],
615 "media_removal" : blocklist["Filtered media"],
616 "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],