]> git.mxchange.org Git - fba.git/blob - fba/networks/mastodon.py
Continued:
[fba.git] / fba / networks / mastodon.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18
19 import bs4
20 import validators
21
22 from fba import csrf
23 from fba import fba
24
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import tidyup
28
29 from fba.http import network
30
31 from fba.models import blocks
32 from fba.models import instances
33
# Translation table for the <h3> headlines found on Mastodon "about" pages.
# Keys are the headline texts as they appear on the page, values are the
# canonical English headlines used as keys of the scraper's block list.
language_mapping = {
    # English -> English
    "Silenced instances": "Silenced servers",
    "Suspended instances": "Suspended servers",
    "Limited instances": "Limited servers",
    "Filtered media": "Filtered media",

    # German -> English
    "Gesperrte Server": "Suspended servers",
    "Gefilterte Medien": "Filtered media",
    "Stummgeschaltete Server": "Silenced servers",

    # Japanese -> English
    "停止済みのサーバー": "Suspended servers",
    "制限中のサーバー": "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー": "Silenced servers",

    # Hebrew -> English
    "שרתים מושעים": "Suspended servers",
    "מדיה מסוננת": "Filtered media",
    "שרתים מוגבלים": "Silenced servers",

    # French -> English
    "Serveurs suspendus": "Suspended servers",
    "Médias filtrés": "Filtered media",
    "Serveurs limités": "Limited servers",
    "Serveurs modérés": "Limited servers",
}
59
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the published block lists from an instance's "about" page.

    The paths "/about/more" and "/about" are tried in that order; the first
    page containing at least one <h3> headline is used. Each headline is
    translated through ``language_mapping`` and, when it names a known block
    list, the rows of the next <table> after it are collected.

    Args:
        domain: Lower-case domain name of the instance to scrape.

    Returns:
        A dict with the keys "reject", "media_removal" and "followers_only".
        Each value is a list of dicts with the keys "domain", "hash" and
        "reason" (in that order). The lists are empty when no page could be
        fetched or no block table was found.

    Raises:
        ValueError: If ``domain`` is not a string, is empty, is not all
            lower-case, is not a valid domain, or ends with ".arpa" or ".tld".
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # DEBUG: print("DEBUG: Fetching mastodon blocks from domain:", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            # DEBUG: print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            # A network failure ends the attempt; the remaining path is not tried
            print(f"ERROR: Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

    # Collected rows, keyed by the canonical English headline
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
    if doc is None:
        print(f"WARNING: Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        headers = []
    else:
        headers = doc.find_all("h3")

    for header in headers:
        header_text = tidyup.reason(header.text)

        # DEBUG: print(f"DEBUG: header_text='{header_text}'")
        if header_text in language_mapping:
            header_text = language_mapping[header_text]
        else:
            print(f"WARNING: header_text='{header_text}' not found in language mapping table")

        if header_text not in blocklist:
            print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
            continue

        # find_all_next() instead of find_next_siblings() to account for
        # instances that e.g. hide lists in a dropdown menu
        tables = header.find_all_next("table")
        if len(tables) == 0:
            print(f"WARNING: No table found after header_text='{header_text}',domain='{domain}' - SKIPPED!")
            continue

        # The first <tr> is skipped (presumably the table's header row)
        for line in tables[0].find_all("tr")[1:]:
            span  = line.find("span")
            cells = line.find_all("td")

            # Malformed rows are skipped rather than crashing the whole crawl
            if span is None or not span.has_attr("title") or len(cells) < 2:
                print(f"WARNING: Malformed row below header_text='{header_text}',domain='{domain}' - SKIPPED!")
                continue

            blocklist[header_text].append({
                "domain": tidyup.domain(span.text),
                # The first 9 characters of the title are dropped
                # (presumably a prefix such as "SHA-256: " - TODO confirm)
                "hash"  : tidyup.domain(span["title"][9:]),
                "reason": tidyup.reason(cells[1].text),
            })

    # DEBUG: print("DEBUG: Returning blocklist for domain:", domain)
    # The same shape is returned on every path, including the "no page" one
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }
138
def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the domain blocks of a Mastodon instance and store them.

    The JSON endpoint "/api/v1/instance/domain_blocks" is queried first. When
    it returns no entries, the blocks are scraped from the instance's "about"
    page via fetch_blocks_from_about(). Every usable blocked domain is
    registered as an instance (if not yet known) and the block is added, or
    its last-seen timestamp and reason are updated. Changes are committed at
    the end.

    Args:
        domain: Lower-case domain name of the blocking instance.
        origin: Origin of the instance; a non-empty string or None. It is only
            validated here.
        nodeinfo_url: Non-empty nodeinfo URL, passed to instances.add() for
            blocked domains that were not deobscured.

    Returns:
        None. Network errors are reported via instances.set_last_error().

    Raises:
        ValueError: If one of the parameters fails validation.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    # Severity reported by the JSON API -> block level stored in the database
    severity_levels = {
        "suspend"       : "reject",
        "silence"       : "followers_only",
        "reject_media"  : "media_removal",
        "reject_reports": "report_removal",
    }

    try:
        # JSON endpoint for newer Mastodon versions
        api_blocks = list()

        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        # DEBUG: print("DEBUG: Querying API domain_blocks:", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
        if "error_message" in data:
            # DEBUG: print(f"DEBUG: Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            print(f"WARNING: JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            api_blocks = data["json"]

        if len(api_blocks) > 0:
            print(f"INFO: Checking {len(api_blocks)} entries from domain='{domain}',software='mastodon' ...")
            for block in api_blocks:
                # DEBUG: print(f"DEBUG: block[]='{type(block)}'")
                if not isinstance(block, dict):
                    # DEBUG: print(f"DEBUG: block[]='{type(block)}' is not of type 'dict' - SKIPPED!")
                    continue

                # Entries lacking a required field are skipped instead of
                # aborting the whole run with an uncaught KeyError
                missing = [key for key in ("domain", "digest", "severity") if key not in block]
                if len(missing) > 0:
                    print(f"WARNING: block from domain='{domain}' is missing keys {missing} - SKIPPED!")
                    continue

                # Map block -> entry; the key order matters because the
                # entry is unpacked positionally further below
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block["comment"] if "comment" in block else None
                }

                severity = block["severity"]
                block_level = severity_levels.get(severity) if isinstance(severity, str) else None

                if block_level is None:
                    print(f"WARNING: Unknown severity='{block['severity']}', domain='{block['domain']}'")
                else:
                    # DEBUG: print(f"DEBUG: Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows[block_level].append(entry)
        else:
            # DEBUG: print(f"DEBUG: domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        print(f"INFO: Checking {len(rows.items())} entries from domain='{domain}',software='mastodon' ...")
        for block_level, entries in rows.items():
            # DEBUG: print("DEBUG: domain,block_level,entries():", domain, block_level, len(entries))
            block_level = tidyup.domain(block_level)

            # DEBUG: print("DEBUG: AFTER-block_level:", block_level)
            if block_level == "":
                print("WARNING: block_level is empty, domain:", domain)
                continue
            elif block_level == "accept":
                # DEBUG: print(f"DEBUG: domain='{domain}' skipping block_level='accept'")
                continue

            # DEBUG: print(f"DEBUG: Checking {len(entries)} entries from domain='{domain}',software='mastodon',block_level='{block_level}' ...")
            for block in entries:
                # Entries are dicts built in the order domain, hash, reason
                blocked, blocked_hash, reason = block.values()
                # DEBUG: print(f"DEBUG: blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}':")
                blocked = tidyup.domain(blocked)
                reason  = tidyup.reason(reason) if reason is not None and reason != "" else None

                # Per-entry value so that a deobscured entry's nodeinfo URL
                # does not leak into the entries processed after it
                blocked_nodeinfo_url = nodeinfo_url

                if blocked == "":
                    print("WARNING: blocked is empty:", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
                    continue

                # Obscured names contain "*" or "?" ("*" takes precedence)
                wildcard = None
                if "*" in blocked:
                    wildcard = "*"
                elif "?" in blocked:
                    wildcard = "?"

                if wildcard is not None:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure(wildcard, blocked, blocked_hash)

                    # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
                    if row is None:
                        print(f"WARNING: Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    # DEBUG: print("DEBUG: Updating domain: ", row[0])
                    blocked              = row[0]
                    blocked_nodeinfo_url = row[2]

                # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
                if not validators.domain(blocked):
                    print(f"WARNING: blocked='{blocked}',software='mastodon' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif blocked.endswith(".tld"):
                    print(f"WARNING: blocked='{blocked}' is a fake domain, please don't crawl them!")
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # Checked again because deobscuring may have changed the name
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - SKIPPED!")
                    continue
                elif not instances.is_registered(blocked):
                    # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{blocked_nodeinfo_url}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, blocked_nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)
                else:
                    # DEBUG: print(f"DEBUG: Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        # DEBUG: print("DEBUG: Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    # DEBUG: print("DEBUG: EXIT!")