# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

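"""
Mastodon-specific blocklist fetching for the Fedi API Block aggregator.

fetch_blocks() queries an instance's /api/v1/instance/domain_blocks JSON
endpoint and records the returned blocks, while fetch_blocks_from_about()
serves as a fallback that scrapes the blocklist tables from the instance's
/about(/more) page.
"""
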
import inspect
import logging

import bs4
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import tidyup

from fba.http import network

from fba.models import blocks
from fba.models import instances

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Language mapping X -> English
language_mapping = {
    # English -> English
    "Silenced instances"            : "Silenced servers",
    "Suspended instances"           : "Suspended servers",
    "Limited instances"             : "Limited servers",
    "Filtered media"                : "Filtered media",
    # German -> English
    "Gesperrte Server"              : "Suspended servers",
    "Gefilterte Medien"             : "Filtered media",
    "Stummgeschaltete Server"       : "Silenced servers",
    # Japanese -> English
    "停止済みのサーバー"            : "Suspended servers",
    "制限中のサーバー"              : "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー"      : "Silenced servers",
    # Hebrew -> English
    "שרתים מושעים"                  : "Suspended servers",
    "מדיה מסוננת"                   : "Filtered media",
    "שרתים מוגבלים"                 : "Silenced servers",
    # French -> English
    "Serveurs suspendus"            : "Suspended servers",
    "Médias filtrés"                : "Filtered media",
    "Serveurs limités"              : "Limited servers",
    "Serveurs modérés"              : "Limited servers",
}
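
# Note: the <h3> headings scraped from /about(/more) are first normalized
# with tidyup.reason() and then translated through this table before being
# matched against the category lists in fetch_blocks_from_about().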

def fetch_blocks_from_about(domain: str) -> dict:
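    """
    Scrapes the blocklist tables from the given instance's /about/more (or
    /about) page and returns them grouped as a dict with the keys "reject",
    "media_removal" and "followers_only", each holding entries of the form
    {"domain": ..., "hash": ..., "reason": ...}.

    Sketch of usage (example.social is a placeholder domain):

        rows = fetch_blocks_from_about("example.social")
        for entry in rows["reject"]:
            print(entry["domain"], entry["reason"])
    """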
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("Fetching mastodon blocks from domain='%s' ...", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                logger.debug(f"path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            logger.warning(f"Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        return blocklist

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s'", header_text)
        if header_text in language_mapping:
            logger.debug("header_text='%s'", header_text)
            header_text = language_mapping[header_text]
        else:
            logger.warning(f"header_text='{header_text}' not found in language mapping table")

        if header_text in blocklist or header_text.lower() in blocklist:
            # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
            for line in header.find_all_next("table")[0].find_all("tr")[1:]:
                blocklist[header_text].append({
                    "domain": tidyup.domain(line.find("span").text),
                    "hash"  : tidyup.domain(line.find("span")["title"][9:]),
                    "reason": tidyup.reason(line.find_all("td")[1].text),
                })
        else:
            logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")

    logger.debug("Returning blocklist for domain='%s' ...", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
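    """
    Fetches blocks from the given Mastodon instance, preferring the JSON
    endpoint /api/v1/instance/domain_blocks and falling back to
    fetch_blocks_from_about() when the API returns zero rows. Fetched
    entries are tidied up, deobscured where needed and stored through the
    blocks model; 'origin' and 'nodeinfo_url' are passed along when newly
    found instances are registered.
    """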
    logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF token by default; there is no need to add network.api_headers here.
    headers = tuple()

    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    try:
        # JSON endpoint for newer Mastodon versions
        found_blocks = list()
        blocklist = list()

        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        logger.debug("Querying API /api/v1/instance/domain_blocks for domain='%s' ...", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

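        # get_json_api() is expected to return a dict carrying "json" on
        # success, or "error_message" and "status_code" on failure; both
        # cases are handled below.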
        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug(f"Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            logger.warning(f"JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            blocklist = data["json"]

        if len(blocklist) > 0:
            logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
            for block in blocklist:
                # Check type
                logger.debug("block[]='%s'", type(block))
                if not isinstance(block, dict):
                    logger.debug(f"block[]='{type(block)}' is not of type 'dict' - SKIPPED!")
                    continue

                # Map block -> entry
                logger.debug(f"block[{type(block)}]='{block}'")
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block.get("comment")
                }

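                # Map the severity reported by the instance onto the local
                # block levels: 'suspend' -> reject, 'silence' -> followers_only,
                # 'reject_media' -> media_removal, 'reject_reports' -> report_removal.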
                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], block.get('comment'))
                if block['severity'] == 'suspend':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['reject'].append(entry)
                elif block['severity'] == 'silence':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['followers_only'].append(entry)
                elif block['severity'] == 'reject_media':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['media_removal'].append(entry)
                elif block['severity'] == 'reject_reports':
                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                    rows['report_removal'].append(entry)
                else:
                    logger.warning("Unknown severity='%s', domain='%s'", block['severity'], block['domain'])
        else:
            logger.debug("domain='%s' has returned zero rows, trying /about/more page ...", domain)
            rows = fetch_blocks_from_about(domain)

        logger.info("Checking %d block levels from domain='%s' ...", len(rows), domain)
        for block_level, blocklist in rows.items():
            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
            block_level = tidyup.domain(block_level)

            logger.debug("block_level='%s' - AFTER!", block_level)
            if block_level == "":
                logger.warning("block_level is empty, domain='%s'", domain)
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
            for block in blocklist:
                logger.debug("block[]='%s'", type(block))
                blocked, blocked_hash, reason = block["domain"], block["hash"], block["reason"]
                logger.debug(f"blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}' - BEFORE!")
                blocked = tidyup.domain(blocked)
                reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                if blocked == "":
                    logger.warning("blocked is empty, domain='%s'", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                    continue
                elif blocked.count("*") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("*", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain: row[0]='%s'", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]
                elif blocked.count("?") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("?", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain: row[0]='%s'", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]

                logger.debug("Looking up instance by domain='%s' ...", blocked)
                if not utils.is_domain_wanted(blocked):
                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                    continue
                elif not instances.is_registered(blocked):
                    logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    logger.debug("Blocking domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)

                    if block_level == "reject":
                        found_blocks.append({
                            "blocked": blocked,
                            "reason" : reason
                        })
                else:
                    logger.debug(f"Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()
    except network.exceptions as exception:
        logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    logger.debug("EXIT!")