]> git.mxchange.org Git - fba.git/blob - fba/networks/mastodon.py
Continued:
[fba.git] / fba / networks / mastodon.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18
19 import bs4
20 import validators
21
22 from fba import blacklist
23 from fba import config
24 from fba import csrf
25 from fba import fba
26 from fba import network
27
28 from fba.helpers import tidyup
29
30 from fba.models import blocks
31 from fba.models import instances
32
# Translation table for the <h3> headlines found on Mastodon "about" pages.
# Keys are headlines as they appear on the page (older English wording or a
# localized form); values are the canonical English headline that the
# scraper in this module groups blocked domains under.
language_mapping = {
    # Older English wording -> current English wording
    "Silenced instances": "Silenced servers",
    "Suspended instances": "Suspended servers",
    "Limited instances": "Limited servers",
    "Filtered media": "Filtered media",
    # German -> English
    "Gesperrte Server": "Suspended servers",
    "Gefilterte Medien": "Filtered media",
    "Stummgeschaltete Server": "Silenced servers",
    # Japanese -> English
    "停止済みのサーバー": "Suspended servers",
    "制限中のサーバー": "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー": "Silenced servers",
    # Hebrew -> English
    "שרתים מושעים": "Suspended servers",
    "מדיה מסוננת": "Filtered media",
    "שרתים מוגבלים": "Silenced servers",
    # French -> English
    "Serveurs suspendus": "Suspended servers",
    "Médias filtrés": "Filtered media",
    "Serveurs limités": "Limited servers",
    "Serveurs modérés": "Limited servers",
}
58
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the block list from the "about" pages of an instance.

    The paths "/about/more" and "/about" are tried in that order; the first
    page containing at least one <h3> headline is used. Each headline is
    translated through language_mapping and, when it names a known block
    category, the rows of the next <table> after it are collected.

    Args:
        domain: Domain name of the instance whose pages are fetched.

    Returns:
        A dict with the keys "reject", "media_removal" and "followers_only".
        Each value is a list of dicts with the keys "domain", "hash" and
        "reason". All lists are empty when no page could be fetched.

    Raises:
        ValueError: If domain is not a str or is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Collected entries, keyed by the canonical (English) headline
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    # DEBUG: print("DEBUG: Fetching mastodon blocks from domain:", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            # DEBUG: print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            # A failed request ends the search, the remaining path is not tried
            print(f"ERROR: Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

    # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
    if doc is None:
        # Fall through to the common return below so that callers always
        # receive the same block-level keyed structure
        print(f"WARNING: Cannot fetch any /about pages for domain='{domain}' - EXIT!")
    else:
        for header in doc.find_all("h3"):
            header_text = tidyup.reason(header.text)

            # DEBUG: print(f"DEBUG: header_text='{header_text}'")
            if header_text in language_mapping:
                header_text = language_mapping[header_text]
            else:
                print(f"WARNING: header_text='{header_text}' not found in language mapping table")

            if header_text not in blocklist:
                print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
                continue

            # Search the whole remaining document, not only siblings, to
            # account for instances that e.g. hide lists in a dropdown menu
            table = header.find_next("table")
            if table is None:
                print(f"WARNING: header_text='{header_text}' is not followed by a table, domain='{domain}' - SKIPPED!")
                continue

            # The first row is skipped (presumably the table's header row)
            for line in table.find_all("tr")[1:]:
                span  = line.find("span")
                cells = line.find_all("td")
                title = span.get("title") if span is not None else None

                if title is None or len(cells) < 2:
                    print(f"WARNING: Malformed row below header_text='{header_text}', domain='{domain}' - SKIPPED!")
                    continue

                blocklist[header_text].append({
                    "domain": tidyup.domain(span.text),
                    # The first 9 characters of the title are dropped,
                    # presumably a "SHA-256: " prefix - TODO confirm
                    "hash"  : tidyup.domain(title[9:]),
                    "reason": tidyup.reason(cells[1].text),
                })

    # DEBUG: print("DEBUG: Returning blocklist for domain:", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }
129
# Maps the "severity" value of an API domain block to the block level used
# as key of the rows processed by fetch_blocks()
_SEVERITY_TO_BLOCK_LEVEL = {
    "suspend"       : "reject",
    "silence"       : "followers_only",
    "reject_media"  : "media_removal",
    "reject_reports": "report_removal",
}

def _rows_from_api(blocklist) -> dict:
    """Group the entries of a domain_blocks API response by block level."""
    rows = {
        "reject"        : [],
        "media_removal" : [],
        "followers_only": [],
        "report_removal": [],
    }

    for block in blocklist:
        # DEBUG: print(f"DEBUG: block[]='{type(block)}'")
        if not isinstance(block, dict):
            # Anything that is not a JSON object cannot be a block entry
            continue

        entry = {
            "domain": block["domain"],
            "hash"  : block["digest"],
            "reason": block.get("comment"),
        }

        # Only look up strings, a non-string severity is reported as unknown
        severity = block["severity"]
        block_level = _SEVERITY_TO_BLOCK_LEVEL.get(severity) if isinstance(severity, str) else None

        if block_level is None:
            print(f"WARNING: Unknown severity='{block['severity']}', domain='{block['domain']}'")
            continue

        # DEBUG: print(f"DEBUG: Adding entry='{entry}' with severity='{block['severity']}' ...")
        rows[block_level].append(entry)

    return rows

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the domain blocks of a Mastodon instance and record them.

    The JSON endpoint "/api/v1/instance/domain_blocks" is queried first. If
    it returns no entries, the block list is scraped from the instance's
    "about" pages through fetch_blocks_from_about() instead. Obscured names
    (containing "*" or "?") are resolved through instances.deobscure().
    Blocked domains not yet registered are added through instances.add(),
    blocks are added or refreshed through the blocks model and the changes
    are committed through fba.connection.commit().

    Request failures listed in network.exceptions are not raised; they are
    reported through instances.set_last_error().

    Args:
        domain: Domain name of the instance whose blocks are fetched.
        origin: Origin of the instance or None; only validated here.
        nodeinfo_url: Passed to instances.add() for blocked domains that are
            registered here, unless a de-obscured database row provides its
            own value.

    Returns:
        None.

    Raises:
        ValueError: If a parameter has the wrong type or is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    try:
        # JSON endpoint of newer Mastodon versions
        # DEBUG: print("DEBUG: Querying API domain_blocks:", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            # DEBUG: print(f"DEBUG: Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            print(f"WARNING: JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return

        api_blocks = data["json"]

        if len(api_blocks) > 0:
            print(f"INFO: Checking {len(api_blocks)} entries from domain='{domain}',software='mastodon' ...")
            rows = _rows_from_api(api_blocks)
        else:
            # DEBUG: print(f"DEBUG: domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        print(f"INFO: Checking {len(rows.items())} entries from domain='{domain}',software='mastodon' ...")
        for block_level, entries in rows.items():
            # DEBUG: print("DEBUG: domain,block_level,entries():", domain, block_level, len(entries))
            block_level = tidyup.domain(block_level)

            # DEBUG: print("DEBUG: AFTER-block_level:", block_level)
            if block_level == "":
                print("WARNING: block_level is empty, domain:", domain)
                continue
            elif block_level == "accept":
                # DEBUG: print(f"DEBUG: domain='{domain}' skipping block_level='accept'")
                continue

            # DEBUG: print(f"DEBUG: Checking {len(entries)} entries from domain='{domain}',software='mastodon',block_level='{block_level}' ...")
            for block in entries:
                # Read by key instead of relying on the dict's value order
                blocked      = tidyup.domain(block["domain"])
                blocked_hash = block["hash"]
                reason       = block["reason"]
                reason       = tidyup.reason(reason) if reason is not None and reason != "" else None
                # DEBUG: print(f"DEBUG: blocked='{blocked}',reason='{reason}' - AFTER!")

                # Start each entry from the caller's value so that data of a
                # previously de-obscured entry cannot leak into this one
                blocked_nodeinfo_url = nodeinfo_url

                if blocked == "":
                    print("WARNING: blocked is empty:", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
                    continue

                # "*" takes precedence over "?" when both are present
                obscure_char = next((char for char in ("*", "?") if char in blocked), None)
                if obscure_char is not None:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure(obscure_char, blocked, blocked_hash)

                    # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
                    if row is None:
                        print(f"WARNING: Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    # DEBUG: print("DEBUG: Updating domain: ", row[0])
                    blocked              = row[0]
                    blocked_nodeinfo_url = row[2]

                # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
                if not validators.domain(blocked):
                    print(f"WARNING: blocked='{blocked}',software='mastodon' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif blocked.endswith(".tld"):
                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
                    continue
                elif not instances.is_registered(blocked):
                    # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',nodeinfo_url='{blocked_nodeinfo_url}'")
                    # The third argument is this function's own name
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, blocked_nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)
                else:
                    # DEBUG: print(f"DEBUG: Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        # DEBUG: print("DEBUG: Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    # DEBUG: print("DEBUG: EXIT!")