# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

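"""Mastodon network handler: fetches domain block lists from Mastodon
instances, either via the /api/v1/instance/domain_blocks API endpoint or by
scraping the localized /about(/more) pages."""
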
import inspect
import logging

import bs4
import validators

from fba import csrf
from fba import fba

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import tidyup

from fba.http import network

from fba.models import blocks
from fba.models import instances

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Language mapping X -> English
language_mapping = {
    # English -> English
    "Silenced instances"            : "Silenced servers",
    "Suspended instances"           : "Suspended servers",
    "Limited instances"             : "Limited servers",
    "Filtered media"                : "Filtered media",
    # German -> English
    "Gesperrte Server"              : "Suspended servers",
    "Gefilterte Medien"             : "Filtered media",
    "Stummgeschaltete Server"       : "Silenced servers",
    # Japanese -> English
    "停止済みのサーバー"            : "Suspended servers",
    "制限中のサーバー"              : "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー"      : "Silenced servers",
    # Hebrew -> English
    "שרתים מושעים"                  : "Suspended servers",
    "מדיה מסוננת"                   : "Filtered media",
    "שרתים מוגבלים"                 : "Silenced servers",
    # French -> English
    "Serveurs suspendus"            : "Suspended servers",
    "Médias filtrés"                : "Filtered media",
    "Serveurs limités"              : "Limited servers",
    "Serveurs modérés"              : "Limited servers",
}

def fetch_blocks_from_about(domain: str) -> dict:
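    """Scrape the instance's /about/more (or /about) page and parse the block
    tables under each section header into a dict with the keys "reject",
    "media_removal" and "followers_only"."""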
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("Fetching Mastodon blocks from domain='%s' ...", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                logger.debug(f"path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            logger.warning(f"Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

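    # Entries collected per section header, normalized to English via language_mapping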
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        return blocklist

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s'", header_text)
        if header_text in language_mapping:
            header_text = language_mapping[header_text]
            logger.debug("header_text='%s' - mapped!", header_text)
        else:
            logger.warning(f"header_text='{header_text}' not found in language mapping table")

        if header_text in blocklist:
            # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
            for line in header.find_all_next("table")[0].find_all("tr")[1:]:
                blocklist[header_text].append({
                    "domain": tidyup.domain(line.find("span").text),
                    "hash"  : tidyup.domain(line.find("span")["title"][9:]),
                    "reason": tidyup.reason(line.find_all("td")[1].text),
                })
        else:
            logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")

    logger.debug("Returning blocklist for domain='%s' ...", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
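    """Fetch blocks from the given Mastodon instance, first via the
    /api/v1/instance/domain_blocks API endpoint and, if that yields no rows,
    by falling back to scraping the /about page. Found blocks are recorded
    via the blocks and instances models and committed to the database."""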
    logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF headers by default; network.api_headers doesn't need to be added here
    headers = dict()

    try:
        logger.debug(f"Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    try:
        # JSON endpoint for newer Mastodon versions
        found_blocks = list()
        blocklist = list()

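        # "rows" collects block entries per block level; it is filled either from
        # the API response below or from the scraped /about page as a fallback.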
        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        logger.debug("Querying API endpoint /api/v1/instance/domain_blocks from domain='%s' ...", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug(f"Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            logger.warning(f"JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            blocklist = data["json"]

        if len(blocklist) > 0:
            logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
            for block in blocklist:
                # Check type
                logger.debug(f"block[]='{type(block)}'")
                if not isinstance(block, dict):
                    logger.debug(f"block[]='{type(block)}' is not of type 'dict' - SKIPPED!")
                    continue

                # Map block -> entry
                logger.debug(f"block[{type(block)}]='{block}'")
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block["comment"] if "comment" in block else None
                }

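                # Map the severity reported by the instance onto the internal
                # block levels used by this aggregator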
                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], entry['reason'])
                if block['severity'] == 'suspend':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['reject'].append(entry)
                elif block['severity'] == 'silence':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['followers_only'].append(entry)
                elif block['severity'] == 'reject_media':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['media_removal'].append(entry)
                elif block['severity'] == 'reject_reports':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['report_removal'].append(entry)
                else:
                    logger.warning(f"Unknown severity='{block['severity']}', domain='{block['domain']}'")
        else:
            logger.debug(f"domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        logger.info("Checking %d block levels from domain='%s' ...", len(rows), domain)
        for block_level, blocklist in rows.items():
            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
            block_level = tidyup.domain(block_level)

            logger.debug("block_level='%s' - AFTER!", block_level)
            if block_level == "":
                logger.warning("block_level is empty, domain='%s'", domain)
                continue
            elif block_level == "accept":
                logger.debug(f"domain='{domain}' skipping block_level='accept'")
                continue

            logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
            for block in blocklist:
                logger.debug(f"block[]='{type(block)}'")
                # Unpack explicitly by key instead of relying on dict insertion order
                blocked, blocked_hash, reason = block["domain"], block["hash"], block["reason"]
                logger.debug(f"blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}'")
                blocked = tidyup.domain(blocked)
                reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
                logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")

                if blocked == "":
                    logger.warning("blocked is empty, domain='%s'", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                    continue
                elif blocked.count("*") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("*", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain='%s' ...", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]
                elif blocked.count("?") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("?", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain='%s' ...", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]

                logger.debug("Looking up instance by domain='%s' ...", blocked)
                if not validators.domain(blocked):
                    logger.warning(f"blocked='{blocked}' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif blocked.endswith(".tld"):
                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
                    continue
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                    continue
                elif not instances.is_registered(blocked):
                    logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    logger.debug("Blocking domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)

                    if block_level == "reject":
                        found_blocks.append({
                            "blocked": blocked,
                            "reason" : reason
                        })
                else:
                    logger.debug(f"Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        logger.debug("Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    logger.debug("EXIT!")