]> git.mxchange.org Git - fba.git/blob - fba/networks/mastodon.py
Continued:
[fba.git] / fba / networks / mastodon.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18
19 import bs4
20 import validators
21
22 from fba import csrf
23 from fba import fba
24
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import tidyup
28
29 from fba.http import network
30
31 from fba.models import blocks
32 from fba.models import instances
33
# Translation table: localized headline on a Mastodon about page -> the
# English headline that fetch_blocks_from_about() uses as its category key.
language_mapping = {
    # English (older wording is normalized as well)
    "Silenced instances": "Silenced servers",
    "Suspended instances": "Suspended servers",
    "Limited instances": "Limited servers",
    "Filtered media": "Filtered media",

    # German
    "Gesperrte Server": "Suspended servers",
    "Gefilterte Medien": "Filtered media",
    "Stummgeschaltete Server": "Silenced servers",

    # Japanese
    "停止済みのサーバー": "Suspended servers",
    "制限中のサーバー": "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー": "Silenced servers",

    # Hebrew
    "שרתים מושעים": "Suspended servers",
    "מדיה מסוננת": "Filtered media",
    # NOTE(review): this headline reads like "limited servers" - confirm
    # whether "Limited servers" would be the more precise target.
    "שרתים מוגבלים": "Silenced servers",

    # French
    "Serveurs suspendus": "Suspended servers",
    "Médias filtrés": "Filtered media",
    "Serveurs limités": "Limited servers",
    "Serveurs modérés": "Limited servers",
}
59
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the block lists from the about page of a Mastodon instance.

    Tries ``/about/more`` first and then ``/about``, stopping at the first
    page that contains ``<h3>`` headlines. Each headline is translated through
    ``language_mapping``; for a recognized headline the rows of the first
    table following it are collected.

    Args:
        domain: Host name of the instance to scrape; must be a non-empty str.

    Returns:
        A dict with the keys ``reject``, ``media_removal`` and
        ``followers_only``. Each value is a list of dicts with the keys
        ``domain``, ``hash`` and ``reason``. All lists are empty when no page
        could be fetched.

    Raises:
        ValueError: If ``domain`` is not a str or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # DEBUG: print("DEBUG: Fetching mastodon blocks from domain:", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            # DEBUG: print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            # A network error aborts the whole attempt; the second path is
            # not tried in that case.
            print(f"ERROR: Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

    # Collected rows, keyed by the normalized English headline
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
    if doc is None:
        print(f"WARNING: Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        # Same shape as the regular return value, so callers never see the
        # internal headline-keyed dict.
        return {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
        }

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        # DEBUG: print(f"DEBUG: header_text='{header_text}'")
        if header_text in language_mapping:
            header_text = language_mapping[header_text]
        else:
            print(f"WARNING: header_text='{header_text}' not found in language mapping table")

        if header_text not in blocklist:
            print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
            continue

        # find_next() instead of a sibling search to account for instances
        # that e.g. hide lists in a dropdown menu; it returns None when no
        # table follows the headline.
        table = header.find_next("table")
        if table is None:
            print(f"WARNING: header_text='{header_text}' is not followed by any table, domain='{domain}' - SKIPPED!")
            continue

        # The first <tr> is skipped - presumably the table's heading row.
        for line in table.find_all("tr")[1:]:
            span  = line.find("span")
            cells = line.find_all("td")
            title = span.get("title") if span is not None else None

            if title is None or len(cells) < 2:
                # Malformed row: do not let it abort the whole scrape
                print(f"WARNING: Unexpected table row below header_text='{header_text}',domain='{domain}' - SKIPPED!")
                continue

            blocklist[header_text].append({
                "domain": tidyup.domain(span.text),
                # presumably drops a 9-character label such as 'SHA-256: '
                # in front of the digest - TODO confirm
                "hash"  : tidyup.domain(title[9:]),
                "reason": tidyup.reason(cells[1].text),
            })

    # DEBUG: print("DEBUG: Returning blocklist for domain:", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }
130
def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the domain blocks of a Mastodon instance and record them.

    The JSON endpoint ``/api/v1/instance/domain_blocks`` is queried first.
    When it returns no entries, ``fetch_blocks_from_about()`` is used as a
    fallback. Each blocked domain that passes validation is registered with
    ``instances.add()`` when it is not yet known; the block itself is added
    with ``blocks.add_instance()`` or, when it already exists, refreshed with
    ``blocks.update_last_seen()`` and ``blocks.update_reason()``. Finally
    ``fba.connection.commit()`` is called.

    Args:
        domain: Host name of the blocking instance; non-empty str.
        origin: Non-empty str or None. It is validated only; this function
            does not use it otherwise.
        nodeinfo_url: Non-empty str that is handed to ``instances.add()`` for
            newly found domains, unless a de-obscured row supplies its own.

    Returns:
        None. Errors from CSRF detection or the API are stored through
        ``instances.set_last_error()``.

    Raises:
        ValueError: If a parameter has the wrong type or is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    # API severity -> block level under which the entry is stored
    severity_to_level = {
        "suspend"       : "reject",
        "silence"       : "followers_only",
        "reject_media"  : "media_removal",
        "reject_reports": "report_removal",
    }

    try:
        # json endpoint for newer mastodons
        blocklist = list()

        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        # DEBUG: print("DEBUG: Querying API domain_blocks:", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            # DEBUG: print(f"DEBUG: Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            print(f"WARNING: JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            blocklist = data["json"]

        if len(blocklist) > 0:
            print(f"INFO: Checking {len(blocklist)} entries from domain='{domain}',software='mastodon' ...")
            for block in blocklist:
                # DEBUG: print(f"DEBUG: block[]='{type(block)}'")
                if not isinstance(block, dict):
                    # DEBUG: print(f"DEBUG: block[]='{type(block)}' is not of type 'dict' - SKIPPED!")
                    continue
                elif "domain" not in block or "digest" not in block or "severity" not in block:
                    # An incomplete entry must not abort the whole run
                    print(f"WARNING: block='{block}' from domain='{domain}' lacks 'domain', 'digest' or 'severity' - SKIPPED!")
                    continue

                severity = block["severity"]
                block_level = severity_to_level.get(severity) if isinstance(severity, str) else None
                if block_level is None:
                    print(f"WARNING: Unknown severity='{block['severity']}', domain='{block['domain']}'")
                    continue

                # Map block -> entry
                # DEBUG: print(f"DEBUG: Adding block='{block}' with severity='{severity}' ...")
                rows[block_level].append({
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block.get("comment"),
                })
        else:
            # DEBUG: print(f"DEBUG: domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        print(f"INFO: Checking {len(rows.items())} entries from domain='{domain}',software='mastodon' ...")
        for block_level, entries in rows.items():
            # DEBUG: print("DEBUG: domain,block_level,entries():", domain, block_level, len(entries))
            block_level = tidyup.domain(block_level)

            # DEBUG: print("DEBUG: AFTER-block_level:", block_level)
            if block_level == "":
                print("WARNING: block_level is empty, domain:", domain)
                continue
            elif block_level == "accept":
                # DEBUG: print(f"DEBUG: domain='{domain}' skipping block_level='accept'")
                continue

            # DEBUG: print(f"DEBUG: Checking {len(entries)} entries from domain='{domain}',software='mastodon',block_level='{block_level}' ...")
            for block in entries:
                # Read by key instead of relying on the dict's value order
                blocked      = tidyup.domain(block["domain"])
                blocked_hash = block["hash"]
                reason       = block["reason"]
                reason       = tidyup.reason(reason) if reason is not None and reason != "" else None

                # Start every entry from the caller's value so that a
                # de-obscured row cannot leak into later entries.
                blocked_nodeinfo_url = nodeinfo_url
                # DEBUG: print(f"DEBUG: blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}'")

                if blocked == "":
                    print("WARNING: blocked is empty:", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
                    continue

                # "*" takes precedence over "?" when both occur
                obscure_char = None
                if "*" in blocked:
                    obscure_char = "*"
                elif "?" in blocked:
                    obscure_char = "?"

                if obscure_char is not None:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure(obscure_char, blocked, blocked_hash)

                    # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
                    if row is None:
                        print(f"WARNING: Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    # DEBUG: print("DEBUG: Updating domain: ", row[0])
                    blocked              = row[0]
                    blocked_nodeinfo_url = row[2]

                # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
                if not validators.domain(blocked):
                    print(f"WARNING: blocked='{blocked}',software='mastodon' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif blocked.endswith(".tld"):
                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # Checked again because de-obscuring may have changed the name
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
                    continue
                elif not instances.is_registered(blocked):
                    # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, blocked_nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)
                else:
                    # DEBUG: print(f"DEBUG: Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        # DEBUG: print("DEBUG: Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    # DEBUG: print("DEBUG: EXIT!")
335     # DEBUG: print("DEBUG: EXIT!")