1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
24 from fba import database
27 from fba.helpers import blacklist
28 from fba.helpers import config
29 from fba.helpers import tidyup
31 from fba.http import network
33 from fba.models import blocks
34 from fba.models import instances
# Module-level logging: configure the root logger once at import time and use
# one module-scoped logger for all messages in this file.
# NOTE(review): basicConfig() at import time is a module side effect; it is a
# no-op if the root logger was already configured by the application entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
39 # Language mapping X -> English
42 "Silenced instances" : "Silenced servers",
43 "Suspended instances" : "Suspended servers",
44 "Limited instances" : "Limited servers",
45 "Filtered media" : "Filtered media",
# Mapping German -> English
47 "Gesperrte Server" : "Suspended servers",
48 "Gefilterte Medien" : "Filtered media",
49 "Stummgeschaltete Server" : "Silenced servers",
51 "停止済みのサーバー" : "Suspended servers",
52 "制限中のサーバー" : "Limited servers",
53 "メディアを拒否しているサーバー": "Filtered media",
54 "サイレンス済みのサーバー" : "Silenced servers",
56 "שרתים מושעים" : "Suspended servers",
57 "מדיה מסוננת" : "Filtered media",
58 "שרתים מוגבלים" : "Silenced servers",
60 "Serveurs suspendus" : "Suspended servers",
61 "Médias filtrés" : "Filtered media",
62 "Serveurs limités" : "Limited servers",
63 "Serveurs modérés" : "Limited servers",
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape a Mastodon instance's /about/more (or /about) page for its
    published block lists.

    Each <h3> headline is normalized via language_mapping (non-English
    headlines translated to their English equivalents); the rows of the table
    following a recognized headline are collected as
    {"domain", "hash", "reason"} dicts.

    Returns a dict with the keys "reject", "media_removal" and
    "followers_only" (the latter merging "Limited" and "Silenced" servers).

    Raises ValueError if `domain` is not a plain, lower-case, valid domain.

    NOTE(review): this extract is missing several source lines (e.g. the
    empty-domain guard, the try: opening the fetch loop, the blocklist/return
    dict openers) — comments below describe only the visible code.
    """
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    # Strict parameter validation: only bare, lower-case, real domains are crawled.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # NOTE(review): extra positional argument without a %s placeholder — the
    # logging module drops it; should be logger.debug("... domain='%s'", domain).
    # The same pattern recurs at other debug calls in this function.
    logger.debug("Fetching mastodon blocks from domain:", domain)

    # Try the more detailed page first, then fall back to the plain about page.
    for path in ["/about/more", "/about"]:
            logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    # (connect, read) timeout tuple taken from configuration
                    (config.get("connection_timeout"), config.get("read_timeout"))

            # A page with at least one <h3> headline is assumed to carry the lists.
            if len(doc.find_all("h3")) > 0:
                logger.debug(f"path='{path}' had some headlines - BREAK!")

        # network.exceptions is presumably a tuple of network-level exception
        # types — TODO confirm; record the failure against the instance.
        except network.exceptions as exception:
            logger.warning(f"Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)

        # Default, empty list per supported (English) block category:
        "Suspended servers": [],
        "Filtered media" : [],
        "Limited servers" : [],
        "Silenced servers" : [],

    logger.debug("doc[]='%s'", type(doc))
        logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")

    # Walk every headline and collect the table rows that follow it.
    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s'", header_text)
        if header_text in language_mapping:
            logger.debug("header_text='%s'", header_text)
            # Translate a non-English headline to its English equivalent.
            header_text = language_mapping[header_text]
            logger.warning(f"header_text='{header_text}' not found in language mapping table")

        # Only headlines matching a known category are processed further.
        if header_text in blocklist or header_text.lower() in blocklist:
            # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
            # [1:] skips the table's header row.
            for line in header.find_all_next("table")[0].find_all("tr")[1:]:
                blocklist[header_text].append({
                    "domain": tidyup.domain(line.find("span").text),
                    # "title" attribute presumably carries a prefixed digest;
                    # the first 9 characters are stripped — TODO confirm the
                    # prefix format (e.g. "SHA-256: ").
                    "hash" : tidyup.domain(line.find("span")["title"][9:]),
                    "reason": tidyup.reason(line.find_all("td")[1].text),
            logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")

    # NOTE(review): same dropped-argument logging pattern as above.
    logger.debug("Returning blocklist for domain:", domain)
        # Map scraped categories onto the API-style block levels.
        "reject" : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
def fetch_blocks(domain: str, origin: str, nodeinfo_url: str) -> None:
    """Fetch and persist the block list of a Mastodon instance.

    First queries the /api/v1/instance/domain_blocks JSON API; if that yields
    zero rows, falls back to scraping the instance's /about page(s) via
    fetch_blocks_from_about(). Blocked domains are registered as new
    instances where needed, block rows are inserted/updated via the blocks
    model, and the database transaction is committed at the end.

    :param domain: instance to crawl (plain, lower-case, valid domain)
    :param origin: originating domain, may be None (see validation below)
    :param nodeinfo_url: nodeinfo URL of `domain`, must be a non-empty str
    Raises ValueError on invalid parameters.

    NOTE(review): this extract is missing several source lines (the try:
    openers, some else: branches and continue statements) — comments below
    describe only the visible code.
    """
    logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    # Strict parameter validation, mirroring fetch_blocks_from_about().
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    # origin may legitimately be None; only non-str non-None values are rejected.
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
        logger.debug("Checking CSRF for domain='%s'", domain)
        # csrf.determine() presumably returns request headers including any
        # CSRF token the instance requires — TODO confirm.
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)

        # JSON endpoint for newer Mastodon versions
        found_blocks = list()

            # Accumulator per block level ("reject" opener elided in extract):
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],

        # NOTE(review): extra positional argument without a %s placeholder —
        # the logging module drops it (same pattern at several debug calls below).
        logger.debug("Querying API domain_blocks:", domain)
        data = network.get_json_api(
            "/api/v1/instance/domain_blocks",
            # (connect, read) timeout tuple taken from configuration
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("data[]='%s'", type(data))
        # get_json_api() signals transport-level failures via "error_message".
        if "error_message" in data:
            logger.debug(f"Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
        # The API itself may also return an application-level error object.
        elif "json" in data and "error" in data["json"]:
            logger.warning(f"JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            blocklist = data["json"]

        if len(blocklist) > 0:
            logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
            for block in blocklist:
                logger.debug("block[]='%s'", type(block))
                # Skip anything that is not a per-domain block record.
                if not isinstance(block, dict):
                    logger.debug(f"block[]='{type(block)}' is of type 'dict' - SKIPPED!")

                logger.debug(f"block[{type(block)}]='{block}'")
                    # Entry keys: the same {"domain", "hash", "reason"} shape
                    # that fetch_blocks_from_about() produces.
                    "domain": block["domain"],
                    "hash" : block["digest"],
                    "reason": block["comment"] if "comment" in block else None

                # NOTE(review): block['comment'] is accessed unguarded here,
                # although the entry build above guards it with an "in" check
                # — may raise KeyError for blocks without a comment; confirm.
                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], block['comment'])
                # Map the Mastodon severity onto the internal block levels.
                if block['severity'] == 'suspend':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['reject'].append(entry)
                elif block['severity'] == 'silence':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['followers_only'].append(entry)
                elif block['severity'] == 'reject_media':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['media_removal'].append(entry)
                elif block['severity'] == 'reject_reports':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['report_removal'].append(entry)
                    logger.warning("Unknown severity='%s', domain='%s'", block['severity'], block['domain'])
            # Fallback: the API returned nothing usable — scrape the HTML pages.
            logger.debug("domain='%s' has returned zero rows, trying /about/more page ...", domain)
            rows = fetch_blocks_from_about(domain)

        logger.info("Checking %d entries from domain='%s' ...", len(rows.items()), domain)
        for block_level, blocklist in rows.items():
            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
            block_level = tidyup.domain(block_level)

            logger.debug("block_level='%s' - AFTER!", block_level)
            if block_level == "":
                logger.warning("block_level is empty, domain='%s'", domain)
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)

            logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
            for block in blocklist:
                logger.debug("block[]='%s'", type(block))
                # NOTE(review): relies on the dict insertion order being
                # exactly domain, hash, reason — which matches how the entry
                # dicts are built above and in fetch_blocks_from_about().
                blocked, blocked_hash, reason = block.values()
                logger.debug(f"blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}':")
                blocked = tidyup.domain(blocked)
                # Normalize an empty reason string to None.
                reason = tidyup.reason(reason) if reason is not None and reason != "" else None
                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                    logger.warning("blocked is empty, domain='%s'", domain)
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                elif blocked.count("*") > 0:
                    # Obscured name (e.g. "exam*le.com"): resolve it via the
                    # stored hash. Doing the hash search for instance names as
                    # well to tidy up the DB.
                    row = instances.deobscure("*", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                        logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")

                    logger.debug("Updating domain: row[0]='%s'", row[0])
                    # row presumably is (domain, origin, nodeinfo_url) — the
                    # assignments for row[0]/row[1] are elided in this extract.
                    nodeinfo_url = row[2]
                elif blocked.count("?") > 0:
                    # Same deobscuring as above, for "?"-obscured names.
                    row = instances.deobscure("?", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                        logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")

                    logger.debug("Updating domain: row[0]='%s'", row[0])
                    nodeinfo_url = row[2]

                    # NOTE(review): dropped-argument logging pattern again.
                    logger.debug("Looking up instance by domain:", blocked)
                    if not utils.is_domain_wanted(blocked):
                        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                    elif not instances.is_registered(blocked):
                        # Register the newly seen instance; the current
                        # function name ("fetch_blocks") is recorded as the
                        # originating command via inspect.
                        logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                        instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                    logger.debug("Looking up instance by domain:", blocked)
                    if not utils.is_domain_wanted(blocked):
                        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                    elif not instances.is_registered(blocked):
                        logger.debug("Hash wasn't found, adding:", blocked, domain)
                        instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                # Insert the block row once; on re-crawls update its metadata.
                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    logger.debug("Blocking:", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)

                    if block_level == "reject":
                        found_blocks.append({
                    logger.debug(f"Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        # Persist everything collected above in one transaction.
        logger.debug("Invoking commit() ...")
        database.connection.commit()
    except network.exceptions as exception:
        logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    logger.debug("EXIT!")