]> git.mxchange.org Git - fba.git/blob - fba/networks/pleroma.py
Continued:
[fba.git] / fba / networks / pleroma.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18 import logging
19
20 import bs4
21
22 from fba import database
23 from fba import utils
24
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import domain as domain_helper
28 from fba.helpers import tidyup
29
30 from fba.http import federation
31 from fba.http import network
32
33 from fba.models import blocks
34 from fba.models import instances
35
# Module-wide logger; importing this module also configures root logging at INFO level
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Translation table: header text as found on an instance's /about page ->
# English header name that is used as key while parsing the block tables
language_mapping = {"Reject": "Suspended servers"}
44
def _deobfuscate_blocked(domain: str, blocked: str):
    """Resolve a wildcard-obscured domain name, if it is obscured at all.

    Names containing neither '*' nor '?' are returned unchanged. Otherwise the
    blocker is flagged through instances.set_has_obfuscation() and the name is
    looked up with instances.deobfuscate(). '*' takes precedence over '?',
    only one lookup is done per name.

    Parameters:
        domain:  the blocking instance (flagged and logged here)
        blocked: the already tidied-up, possibly obscured domain name

    Returns:
        The usable domain name, or None when the lookup returned no row.
    """
    for char in ("*", "?"):
        if blocked.count(char) == 0:
            continue

        logger.debug("domain='%s' uses obfuscated domains, marking ...", domain)
        instances.set_has_obfuscation(domain, True)

        # Obscured domain name with no hash
        row = instances.deobfuscate(char, blocked)

        logger.debug("row[]='%s'", type(row))
        if row is None:
            logger.warning("Cannot deobfuscate blocked='%s',domain='%s' - SKIPPED!", blocked, domain)
            return None

        logger.debug("blocked='%s' de-obscured to '%s'", blocked, row[0])
        return row[0]

    return blocked

def _collect_blocks(domain: str, block_level: str, blocklist, blockdict: list) -> None:
    """Append one record per usable entry of blocklist to blockdict.

    Entries that are empty after tidyup.domain(), end with '.arpa' or '.tld',
    cannot be de-obscured or are rejected by utils.is_domain_wanted() are
    skipped. Appended records carry no reason (None).

    Parameters:
        domain:      the blocking instance
        block_level: block level stored with every appended record
        blocklist:   iterable of raw blocked domain names
        blockdict:   list that is extended in place
    """
    for blocked in blocklist:
        logger.debug("blocked='%s' - BEFORE!", blocked)
        blocked = tidyup.domain(blocked)
        logger.debug("blocked='%s' - AFTER!", blocked)

        if blocked == "":
            logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
            continue
        elif blocked.endswith(".arpa"):
            logger.debug("blocked='%s' is a reverse IP address - SKIPPED!", blocked)
            continue
        elif blocked.endswith(".tld"):
            logger.debug("blocked='%s' is a fake domain - SKIPPED!", blocked)
            continue

        blocked = _deobfuscate_blocked(domain, blocked)
        if blocked is None:
            continue

        logger.debug("blocked='%s' - DEobfuscatED!", blocked)
        if not utils.is_domain_wanted(blocked):
            logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
            continue

        logger.debug("Appending blocker='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
        blockdict.append({
            "blocker"    : domain,
            "blocked"    : blocked,
            "reason"     : None,
            "block_level": block_level,
        })

def _update_reason(blockdict: list, blocked: str, reason) -> None:
    """Set reason on every record in blockdict whose 'blocked' equals blocked.

    Records are matched on the blocked domain only, regardless of their
    block level.
    """
    logger.debug("Checking %d blockdict records ...", len(blockdict))
    for block in blockdict:
        logger.debug("block[blocked]='%s',blocked='%s'", block["blocked"], blocked)
        if block["blocked"] == blocked:
            logger.debug("Updating reason='%s' for blocked='%s'", reason, block["blocked"])
            block["reason"] = reason

def fetch_blocks(domain: str, nodeinfo_url: str) -> list:
    """Fetch the block list a Pleroma instance publishes.

    The blocks are read from the 'metadata' -> 'federation' part of the
    instance's nodeinfo ('mrf_simple' / 'quarantined_instances'), reasons are
    merged in from 'mrf_simple_info' / 'quarantined_instances_info'. When none
    of these elements is present, the instance's /about page is parsed instead
    (see fetch_blocks_from_about()).

    Parameters:
        domain:       the blocking instance, validated by domain_helper.raise_on()
        nodeinfo_url: URL of the instance's nodeinfo document, non-empty string

    Returns:
        A list of dicts with the keys 'blocker', 'blocked', 'reason' and
        'block_level'. The list is empty when nodeinfo could not be fetched
        or lacks 'metadata' / 'federation'.

    Raises:
        ValueError: nodeinfo_url is not a string or is empty, or a reason in
                    'mrf_simple_info' is neither a string, a dict with key
                    'reason' nor None.
    """
    logger.debug("domain='%s',nodeinfo_url='%s' - CALLED!", domain, nodeinfo_url)
    domain_helper.raise_on(domain)

    if not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    blockdict = list()
    rows = None
    try:
        logger.debug("Fetching nodeinfo: domain='%s',nodeinfo_url='%s'", domain, nodeinfo_url)
        rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching nodeinfo from domain='%s'", type(exception), domain)
        instances.set_last_error(domain, exception)

    if rows is None:
        logger.warning("Could not fetch nodeinfo from domain='%s'", domain)
        return list()
    elif "metadata" not in rows:
        logger.warning("rows()=%d does not have key 'metadata', domain='%s'", len(rows), domain)
        return list()
    elif "federation" not in rows["metadata"]:
        logger.warning("rows()=%d does not have key 'federation', domain='%s'", len(rows['metadata']), domain)
        return list()

    data = rows["metadata"]["federation"]
    found = False

    logger.debug("data[]='%s'", type(data))
    if "mrf_simple" in data:
        logger.debug("Found mrf_simple in API response from domain='%s'", domain)
        found = True

        # 'quarantined_instances' is optional: only merge it in when the
        # instance actually publishes it, a missing key must not abort here
        levels = dict(data["mrf_simple"])
        if "quarantined_instances" in data:
            levels["quarantined_instances"] = data["quarantined_instances"]

        for block_level, blocklist in levels.items():
            logger.debug("block_level='%s', blocklist()=%d", block_level, len(blocklist))
            block_level = tidyup.domain(block_level)
            logger.debug("block_level='%s' - AFTER!", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(blocklist), domain, block_level)
            _collect_blocks(domain, block_level, blocklist, blockdict)

    elif "quarantined_instances" in data:
        logger.debug("Found 'quarantined_instances' in JSON response: domain='%s'", domain)
        found = True
        _collect_blocks(domain, "quarantined", data["quarantined_instances"], blockdict)

    else:
        logger.warning("Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='%s'", domain)

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    # Reasons
    if "mrf_simple_info" in data:
        logger.debug("Found mrf_simple_info in API response: domain='%s'", domain)
        found = True
        infos = {
            **data["mrf_simple_info"],
            **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {})
        }

        for block_level, info in infos.items():
            logger.debug("block_level='%s', info.items()=%d", block_level, len(info.items()))
            block_level = tidyup.domain(block_level)
            logger.debug("block_level='%s' - AFTER!", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(info.items()), domain, block_level)
            for blocked, reason in info.items():
                logger.debug("blocked='%s',reason[%s]='%s' - BEFORE!", blocked, type(reason), reason)
                blocked = tidyup.domain(blocked)
                logger.debug("blocked='%s' - AFTER!", blocked)

                if isinstance(reason, str):
                    logger.debug("reason[] is a string")
                    reason = tidyup.reason(reason)
                elif isinstance(reason, dict) and "reason" in reason:
                    logger.debug("reason[] is a dict")
                    reason = tidyup.reason(reason["reason"])
                elif reason is not None:
                    raise ValueError(f"Cannot handle reason[]='{type(reason)}'")

                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                if blocked == "":
                    logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
                    continue

                blocked = _deobfuscate_blocked(domain, blocked)
                if blocked is None:
                    continue

                logger.debug("blocked='%s' - DEobfuscatED!", blocked)
                if not utils.is_domain_wanted(blocked):
                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                    continue

                _update_reason(blockdict, blocked, reason)

    elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
        logger.debug("Found 'quarantined_instances_info' in JSON response: domain='%s'", domain)
        found = True
        block_level = "quarantined"

        quarantined = data["quarantined_instances_info"]["quarantined_instances"]

        # Iterate key and value together: the value must be taken from the
        # raw key, the tidied-up name may differ from it
        for raw_blocked, info in quarantined.items():
            logger.debug("blocked='%s' - BEFORE!", raw_blocked)
            blocked = tidyup.domain(raw_blocked)
            logger.debug("blocked='%s' - AFTER!", blocked)

            if not isinstance(info, dict) or "reason" not in info:
                # One incomplete entry must not discard all remaining ones
                logger.warning("Cannot find reason for blocked='%s' in rows()=%d,domain='%s' - SKIPPED!", blocked, len(quarantined), domain)
                continue

            reason = info["reason"]
            logger.debug("reason='%s'", reason)

            if blocked == "":
                logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
                continue

            blocked = _deobfuscate_blocked(domain, blocked)
            if blocked is None:
                continue

            logger.debug("blocked='%s' - DEobfuscatED!", blocked)
            if not utils.is_domain_wanted(blocked):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue

            _update_reason(blockdict, blocked, reason)
    else:
        logger.warning("Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='%s'", domain)

    if not found:
        logger.debug("Did not find any useable JSON elements, domain='%s', continuing with /about page ...", domain)
        blocklist = fetch_blocks_from_about(domain)

        logger.debug("blocklist()=%d", len(blocklist))
        if len(blocklist) > 0:
            logger.info("Checking %d record(s) ...", len(blocklist))
            for block_level in blocklist:
                logger.debug("block_level='%s'", block_level)

                records = blocklist[block_level]
                logger.debug("rows[%s]()=%d'", type(records), len(records))
                for record in records:
                    logger.debug("record[]='%s'", type(record))
                    blocked = tidyup.domain(record["blocked"])
                    reason  = tidyup.reason(record["reason"])
                    logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                    if blocked == "":
                        logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
                        continue

                    blocked = _deobfuscate_blocked(domain, blocked)
                    if blocked is None:
                        continue

                    logger.debug("blocked='%s' - DEobfuscatED!", blocked)
                    if not utils.is_domain_wanted(blocked):
                        logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                        continue

                    logger.debug("Appending blocker='%s',blocked='%s',reason='%s',block_level='%s' ...",domain, blocked, reason, block_level)
                    blockdict.append({
                        "blocker"    : domain,
                        "blocked"    : blocked,
                        "reason"     : reason,
                        "block_level": block_level,
                    })

    logger.debug("blockdict()=%d - EXIT!", len(blockdict))
    return blockdict
436
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape the block tables from an instance's /about page.

    The page is fetched from '/instance/about/index.html' and parsed with
    BeautifulSoup. Each 'h2' header whose (translated, see language_mapping)
    text is a known section name is expected to be followed by a table; all
    table rows but the first one are read as "blocked domain, reason".

    Parameters:
        domain: the instance to query, validated by domain_helper.raise_on()

    Returns:
        A dict with the keys 'reject', 'media_removal' and 'followers_only',
        each holding a list of {'blocked': ..., 'reason': ...} dicts. An empty
        dict is returned when no page could be fetched.
    """
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    domain_helper.raise_on(domain)

    logger.debug("Fetching mastodon blocks from domain='%s'", domain)
    doc = None
    for path in ["/instance/about/index.html"]:
        try:
            # Forget any document parsed from a previous path
            doc = None

            logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
            response = network.fetch_response(
                domain,
                path,
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            )

            logger.debug("response.ok='%s',response.status_code='%d',response.text()=%d", response.ok, response.status_code, len(response.text))
            if not response.ok or response.text.strip() == "":
                logger.warning("path='%s' does not exist on domain='%s' - SKIPPED!", path, domain)
                continue

            logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
            doc = bs4.BeautifulSoup(
                response.text,
                "html.parser",
            )

            logger.debug("doc[]='%s'", type(doc))
            if doc.find("h2") is not None:
                logger.debug("Found 'h2' header in path='%s' - BREAK!", path)
                break

        except network.exceptions as exception:
            logger.warning("Cannot fetch from domain='%s',exception[%s]='%s'", domain, type(exception), str(exception))
            instances.set_last_error(domain, exception)
            break

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
        # Empty dict (not a list) to honour the annotated return type,
        # len() == 0 still tells the caller that nothing was found
        return {}

    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    for header in doc.find_all("h2"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s' - BEFORE!", header_text)
        if header_text in language_mapping:
            logger.debug("header_text='%s' - FOUND!", header_text)
            header_text = language_mapping[header_text]
        else:
            logger.warning("header_text='%s' not found in language mapping table", header_text)

        logger.debug("header_text='%s' - AFTER!", header_text)
        if header_text not in blocklist:
            logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist))
            continue

        # The table does not need to be a direct sibling of the header,
        # find_next() also finds it when it is nested deeper in the document
        table = header.find_next("table")
        if table is None:
            logger.warning("header_text='%s' is not followed by any table, domain='%s' - SKIPPED!", header_text, domain)
            continue

        logger.debug("Found header_text='%s', importing domain blocks ...", header_text)
        # First row is skipped, presumably the table's header row
        for line in table.find_all("tr")[1:]:
            logger.debug("line[]='%s'", type(line))
            cells = line.find_all("td")
            if len(cells) < 2:
                logger.warning("line has only %d cell(s), header_text='%s',domain='%s' - SKIPPED!", len(cells), header_text, domain)
                continue

            blocklist[header_text].append({
                "blocked": tidyup.domain(cells[0].text),
                "reason" : tidyup.reason(cells[1].text),
            })

    logger.debug("Returning blocklist for domain='%s' - EXIT!", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }