]> git.mxchange.org Git - fba.git/blob - fba/networks/mastodon.py
Continued:
[fba.git] / fba / networks / mastodon.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18
19 import bs4
20 import validators
21
22 from fba import blacklist
23 from fba import blocks
24 from fba import config
25 from fba import csrf
26 from fba import fba
27 from fba import instances
28 from fba import network
29
30 from fba.helpers import tidyup
31
# Translation table for the <h3> headlines found on an instance's about page.
# Keys are the headlines as rendered in the instance's language, values are
# the canonical English headlines used as block list keys by
# fetch_blocks_from_about().
language_mapping = {
    # English (older wording included) -> canonical English
    "Silenced instances": "Silenced servers",
    "Suspended instances": "Suspended servers",
    "Limited instances": "Limited servers",
    "Filtered media": "Filtered media",

    # German -> English
    "Gesperrte Server": "Suspended servers",
    "Gefilterte Medien": "Filtered media",
    "Stummgeschaltete Server": "Silenced servers",

    # Japanese -> English
    "停止済みのサーバー": "Suspended servers",
    "制限中のサーバー": "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー": "Silenced servers",

    # Hebrew -> English
    "שרתים מושעים": "Suspended servers",
    "מדיה מסוננת": "Filtered media",
    "שרתים מוגבלים": "Silenced servers",

    # French -> English
    "Serveurs suspendus": "Suspended servers",
    "Médias filtrés": "Filtered media",
    "Serveurs limités": "Limited servers",
    "Serveurs modérés": "Limited servers",
}
57
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the published block lists from an instance's about page.

    The paths "/about/more" and "/about" are tried in that order; the first
    page containing at least one <h3> headline is used. Every headline is
    translated through ``language_mapping`` and, when it names a known block
    list, the rows of the next <table> after it are collected.

    Args:
        domain: Host name of the instance whose about page is scraped.

    Returns:
        A dict with the keys "reject", "media_removal" and "followers_only".
        Each value is a list of dicts with the keys "domain", "hash" and
        "reason". All lists are empty when no page could be fetched.

    Raises:
        ValueError: If ``domain`` is not a ``str`` or is empty.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # DEBUG: print("DEBUG: Fetching mastodon blocks from domain:", domain)
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    doc = None
    for path in ("/about/more", "/about"):
        try:
            # DEBUG: print(f"DEBUG: Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if doc.find_all("h3"):
                # DEBUG: print(f"DEBUG: path='{path}' had some headlines - BREAK!")
                break

        except Exception as exception:
            # Best effort: record the failure and stop trying further paths.
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit are not swallowed here.
            print("ERROR: Cannot fetch from domain:", domain, exception)
            instances.update_last_error(domain, exception)
            break

    # DEBUG: print(f"DEBUG: doc[]='{type(doc)}'")
    if doc is None:
        print(f"WARNING: Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        # Same shape as the regular return value so callers can rely on the keys
        return {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
        }

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        # DEBUG: print(f"DEBUG: header_text='{header_text}'")
        if header_text in language_mapping:
            # DEBUG: print(f"DEBUG: header_text='{header_text}'")
            header_text = language_mapping[header_text]
        else:
            print(f"WARNING: header_text='{header_text}' not found in language mapping table")

        if header_text not in blocklist:
            print(f"WARNING: header_text='{header_text}' not found in blocklist()={len(blocklist)}")
            continue

        # Search everything after the headline (not only siblings) to account
        # for instances that e.g. hide lists in a dropdown menu
        table = header.find_next("table")
        if table is None:
            print(f"WARNING: header_text='{header_text}' is not followed by any table - SKIPPED!")
            continue

        # The first row is skipped (presumably the table's header row)
        for line in table.find_all("tr")[1:]:
            span  = line.find("span")
            cells = line.find_all("td")
            title = span.get("title") if span is not None else None

            if title is None or len(cells) < 2:
                print(f"WARNING: Unexpected row markup below header_text='{header_text}' - SKIPPED!")
                continue

            blocklist[header_text].append(
                {
                    "domain": tidyup.domain(span.text),
                    # The first 9 characters of the title are dropped
                    # (presumably a "SHA-256: " prefix - TODO confirm)
                    "hash"  : tidyup.domain(title[9:]),
                    "reason": tidyup.reason(cells[1].text),
                }
            )

    # DEBUG: print("DEBUG: Returning blocklist for domain:", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }
130
def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the domain blocks of a Mastodon instance and store them.

    The JSON endpoint "/api/v1/instance/domain_blocks" is queried first. When
    it returns no entries, the about page is scraped through
    fetch_blocks_from_about() instead. Every blocked domain is tidied up,
    de-obscured when it contains "*", registered through instances.add() when
    unknown, and then either added as a new block or has its last-seen
    timestamp and reason updated. Changes are committed at the end.

    Args:
        domain: Host name of the instance whose block list is fetched.
        origin: Validated only (``None`` is accepted); not otherwise used here.
        nodeinfo_url: Passed on to instances.add() for newly found domains,
            unless a de-obscured record supplies its own value.

    Returns:
        None. Returns early when the CSRF check fails or the API reports an
        error.

    Raises:
        ValueError: If a parameter has the wrong type or is an empty string.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        return

    # Severity as reported by the JSON API -> block level used for storage
    severity_levels = {
        "suspend"       : "reject",
        "silence"       : "followers_only",
        "reject_media"  : "media_removal",
        "reject_reports": "report_removal",
    }

    try:
        # json endpoint for newer mastodons
        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        # DEBUG: print("DEBUG: Querying API domain_blocks:", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" in data:
            # DEBUG: print(f"DEBUG: Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.update_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            print(f"WARNING: JSON API returned error message: '{data['json']['error']}'")
            instances.update_last_error(domain, data)
            return

        # Getting blocklist
        api_blocks = data["json"]

        if api_blocks:
            print(f"INFO: Checking {len(api_blocks)} entries from domain='{domain}',software='mastodon' ...")
            for block in api_blocks:
                # Map block -> entry
                # DEBUG: print(f"DEBUG: block[{type(block)}]='{block}'")
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block.get("comment"),
                }

                block_level = severity_levels.get(block["severity"])
                if block_level is None:
                    print("WARNING: Unknown severity:", block['severity'], block['domain'])
                    continue

                # DEBUG: print(f"DEBUG: Adding entry='{entry}' with severity='{block['severity']}' ...")
                rows[block_level].append(entry)
        else:
            # DEBUG: print(f"DEBUG: domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        print(f"INFO: Checking {len(rows.items())} entries from domain='{domain}',software='mastodon' ...")
        for block_level, entries in rows.items():
            # DEBUG: print("DEBUG: domain,block_level,entries():", domain, block_level, len(entries))
            block_level = tidyup.domain(block_level)

            # DEBUG: print("DEBUG: AFTER-block_level:", block_level)
            if block_level == "":
                print("WARNING: block_level is empty, domain:", domain)
                continue

            # DEBUG: print(f"DEBUG: Checking {len(entries)} entries from domain='{domain}',software='mastodon',block_level='{block_level}' ...")
            for block in entries:
                # DEBUG: print(f"DEBUG: block[]='{type(block)}'")
                # Access by key rather than relying on the dict's value order
                blocked      = tidyup.domain(block["domain"])
                blocked_hash = block["hash"]
                reason       = block["reason"]
                reason       = tidyup.reason(reason) if reason is not None and reason != "" else None
                # DEBUG: print(f"DEBUG: blocked='{blocked}',reason='{reason}' - AFTER!")

                # Per-entry value: a de-obscured record must not leak its
                # nodeinfo URL into the following entries
                blocked_nodeinfo_url = nodeinfo_url

                if blocked == "":
                    print("WARNING: blocked is empty:", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    # DEBUG: print(f"DEBUG: blocked='{blocked}' is blacklisted - skipping!")
                    continue
                elif "*" in blocked:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("*", blocked, blocked_hash)

                    # DEBUG: print(f"DEBUG: row[]='{type(row)}'")
                    if row is None:
                        print(f"WARNING: Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    # DEBUG: print("DEBUG: Updating domain: ", row[0])
                    blocked              = row[0]
                    blocked_nodeinfo_url = row[2]

                # DEBUG: print("DEBUG: Looking up instance by domain:", blocked)
                if not validators.domain(blocked):
                    print(f"WARNING: blocked='{blocked}',software='mastodon' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    print(f"WARNING: blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif not instances.is_registered(blocked):
                    # DEBUG: print(f"DEBUG: Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',nodeinfo_url='{blocked_nodeinfo_url}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, blocked_nodeinfo_url)

                blocking = blocked if blocked.count("*") <= 1 else blocked_hash
                # DEBUG: print(f"DEBUG: blocking='{blocking}',blocked='{blocked}',blocked_hash='{blocked_hash}'")

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    # DEBUG: print("DEBUG: Blocking:", domain, blocked, block_level)
                    blocks.add_instance(domain, blocking, reason, block_level)
                else:
                    # DEBUG: print(f"DEBUG: Updating block last seen and reason for domain='{domain}',blocking='{blocking}' ...")
                    blocks.update_last_seen(domain, blocking, block_level)
                    blocks.update_reason(reason, domain, blocking, block_level)

        # DEBUG: print("DEBUG: Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        print(f"ERROR: domain='{domain}',software='mastodon',exception[{type(exception)}]:'{str(exception)}'")

    # DEBUG: print("DEBUG: EXIT!")