]> git.mxchange.org Git - fba.git/blob - fba/networks/mastodon.py
Continued:
[fba.git] / fba / networks / mastodon.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18 import logging
19
20 import bs4
21
22 from fba import csrf
23 from fba import database
24 from fba import utils
25
26 from fba.helpers import blacklist
27 from fba.helpers import config
28 from fba.helpers import domain as domain_helper
29 from fba.helpers import tidyup
30
31 from fba.http import network
32
33 from fba.models import blocks
34 from fba.models import instances
35
# Module-wide logger setup.
# NOTE(review): calling basicConfig() at import time in a library module
# configures the root logger and may override the application's own logging
# setup — consider moving this to the program entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
38
# Maps localized blocklist section headlines (as scraped from a Mastodon
# instance's /about page) to their canonical English equivalents.
language_mapping = {
    # English -> English
    "Silenced instances"            : "Silenced servers",
    "Suspended instances"           : "Suspended servers",
    "Limited instances"             : "Limited servers",
    "Filtered media"                : "Filtered media",
    # German -> English
    "Gesperrte Server"              : "Suspended servers",
    "Gefilterte Medien"             : "Filtered media",
    "Stummgeschaltete Server"       : "Silenced servers",
    # Japanese -> English
    "停止済みのサーバー"            : "Suspended servers",
    "制限中のサーバー"              : "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー"      : "Silenced servers",
    # Hebrew -> English
    "שרתים מושעים"                  : "Suspended servers",
    "מדיה מסוננת"                   : "Filtered media",
    # NOTE(review): this headline literally means "limited servers" but is
    # mapped to "Silenced servers" — confirm the intended category.
    "שרתים מוגבלים"                 : "Silenced servers",
    # French -> English
    "Serveurs suspendus"            : "Suspended servers",
    "Médias filtrés"                : "Filtered media",
    "Serveurs limités"              : "Limited servers",
    "Serveurs modérés"              : "Limited servers",
}
65
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the /about/more (or /about) HTML page of a Mastodon instance
    and extract its published blocklists.

    Parameters:
        domain: Domain name of the instance to scrape.

    Returns:
        A dict with the keys "reject", "media_removal" and "followers_only",
        each holding a list of {"domain", "hash", "reason"} entries. All
        lists are empty when no page could be fetched or parsed.
    """
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    domain_helper.raise_on(domain)

    logger.debug("Fetching mastodon blocks from domain='%s'", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            # The blocklist sections are rendered as <h3> headlines; once we
            # found a page that has some, stop probing further paths.
            if len(doc.find_all("h3")) > 0:
                logger.debug("path='%s' had some headlines - BREAK!", path)
                break

        except network.exceptions as exception:
            logger.warning("Cannot fetch from domain='%s',exception='%s'", domain, type(exception))
            instances.set_last_error(domain, exception)
            break

    # Collected entries, keyed by canonical (English) section headline
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        # BUGFIX: return the same shape as the regular exit below, so the
        # caller always sees "reject"/"media_removal"/"followers_only" keys.
        logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
        return {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
        }

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s'", header_text)
        if header_text in language_mapping:
            logger.debug("Translating header_text='%s' ...", header_text)
            header_text = language_mapping[header_text]
        else:
            logger.warning("header_text='%s' not found in language mapping table", header_text)

        # NOTE: previous "or header_text.lower() in blocklist" was dead code
        # (all keys are capitalized) and would have raised KeyError below.
        if header_text in blocklist:
            # find_all_next("table") instead of find_next_siblings() to also
            # catch instances that e.g. hide the lists in a dropdown menu
            tables = header.find_all_next("table")
            if len(tables) == 0:
                logger.warning("No table found after header_text='%s' for domain='%s' - SKIPPED!", header_text, domain)
                continue

            # First <tr> is the column header row, skip it
            for line in tables[0].find_all("tr")[1:]:
                blocklist[header_text].append({
                    "domain": tidyup.domain(line.find("span").text),
                    # [9:] strips a fixed-length prefix from the title
                    # attribute — presumably "SHA-256: "; TODO confirm
                    "hash"  : tidyup.domain(line.find("span")["title"][9:]),
                    "reason": tidyup.reason(line.find_all("td")[1].text),
                })
        else:
            logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist))

    logger.debug("Returning blocklist for domain='%s' - EXIT!", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }
133
def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the blocklist of a Mastodon instance and store it in the database.

    First queries the JSON API endpoint /api/v1/instance/domain_blocks; when
    that yields zero rows, falls back to scraping the /about(/more) HTML page
    via fetch_blocks_from_about().

    Parameters:
        domain      : Domain name of the instance to fetch blocks from
        origin      : Origin domain that led us to this instance (str or None)
        nodeinfo_url: URL of the instance's nodeinfo document

    Raises:
        ValueError: When 'origin' or 'nodeinfo_url' have a wrong type or are empty.
    """
    logger.debug("domain='%s',origin='%s',nodeinfo_url='%s' - CALLED!", domain, origin, nodeinfo_url)
    domain_helper.raise_on(domain)
    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_blocks,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return

    try:
        # NOTE(review): found_blocks is collected but currently never used
        # beyond being appended to — candidate for removal or for returning.
        found_blocks = list()
        blocklist = list()

        # Entries grouped by our internal block level
        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        # JSON endpoint for newer mastodons
        logger.debug("Querying API domain_blocks: domain='%s'", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch domain_blocks from domain='%s': status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            logger.warning("JSON API returned error message: '%s'", data['json']['error'])
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            blocklist = data["json"]

        if len(blocklist) > 0:
            # API severity -> our internal block level (keys of 'rows')
            severity_mapping = {
                "suspend"       : "reject",
                "silence"       : "followers_only",
                "reject_media"  : "media_removal",
                "reject_reports": "report_removal",
            }

            logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
            for block in blocklist:
                # Check type
                logger.debug("block[]='%s'", type(block))
                if not isinstance(block, dict):
                    logger.debug("block[]='%s' is not of type 'dict' - SKIPPED!", type(block))
                    continue

                # Map block -> entry; "comment" may be absent from the API response
                logger.debug("block[%s]='%s'", type(block), block)
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block.get("comment"),
                }

                # BUGFIX: log entry["reason"] instead of block["comment"],
                # which raised KeyError when the API omitted "comment".
                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], entry['reason'])
                if block["severity"] in severity_mapping:
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows[severity_mapping[block["severity"]]].append(entry)
                else:
                    logger.warning("Unknown severity='%s', domain='%s'", block['severity'], block['domain'])
        else:
            logger.debug("domain='%s' has returned zero rows, trying /about/more page ...", domain)
            rows = fetch_blocks_from_about(domain)

        logger.info("Checking %d entries from domain='%s' ...", len(rows), domain)
        for block_level, blocklist in rows.items():
            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
            block_level = tidyup.domain(block_level)

            logger.debug("block_level='%s' - AFTER!", block_level)
            if block_level == "":
                logger.warning("block_level is empty, domain='%s'", domain)
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            # BUGFIX: previous format string mixed literal '{domain}' braces
            # with lazy %-style arguments and never interpolated them.
            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(blocklist), domain, block_level)
            for block in blocklist:
                logger.debug("block[]='%s'", type(block))
                # Explicit key access instead of relying on dict insertion
                # order via block.values() — both producers use these keys.
                blocked      = block["domain"]
                blocked_hash = block["hash"]
                reason       = block["reason"]

                logger.debug("blocked='%s',blocked_hash='%s',reason='%s'", blocked, blocked_hash, reason)
                blocked = tidyup.domain(blocked)
                reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                if blocked == "":
                    logger.warning("blocked is empty, domain='%s'", domain)
                    continue
                elif "*" in blocked or "?" in blocked:
                    # Blocked domain is obfuscated with wild-card characters;
                    # the two former branches were identical except for the
                    # character — "*" keeps precedence over "?" as before.
                    logger.debug("domain='%s' uses obfucated domains, marking ...", domain)
                    instances.set_has_obfucation(domain, True)

                    obfuscation_char = "*" if "*" in blocked else "?"

                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobfucate(obfuscation_char, blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning("Cannot deobfucate blocked='%s',blocked_hash='%s' - SKIPPED!", blocked, blocked_hash)
                        continue

                    logger.debug("Updating domain: row[0]='%s'", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]

                # NOTE: a verbatim duplicate of this wanted/registered check
                # followed here previously; it was a no-op and was removed.
                logger.debug("Looking up instance by domain: blocked='%s'", blocked)
                if not utils.is_domain_wanted(blocked):
                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                    continue
                elif not instances.is_registered(blocked):
                    logger.debug("Domain blocked='%s' wasn't found, adding ..., domain='%s',origin='%s',nodeinfo_url='%s'", blocked, domain, origin, nodeinfo_url)
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    logger.debug("Blocking domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)

                    if block_level == "reject":
                        found_blocks.append({
                            "blocked": blocked,
                            "reason" : reason
                        })
                else:
                    logger.debug("Updating block last seen and reason for domain='%s',blocked='%s' ...", domain, blocked)
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()
    except network.exceptions as exception:
        logger.warning("domain='%s',exception[%s]='%s'", domain, type(exception), str(exception))
        instances.set_last_error(domain, exception)

    logger.debug("EXIT!")