]> git.mxchange.org Git - fba.git/blob - fba/networks/pleroma.py
390b01e064c0f93bfe7195a39f779a57922c89c0
[fba.git] / fba / networks / pleroma.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18 import logging
19
20 import bs4
21
22 from fba import database
23 from fba import utils
24
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import domain as domain_helper
28 from fba.helpers import tidyup
29
30 from fba.http import federation
31 from fba.http import network
32
33 from fba.models import blocks
34 from fba.models import instances
35
# Set up logging for this module: configure the root logger at INFO level
# on import and obtain a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Translation table for <h2> headings found on an instance's about page:
# the key is the heading as published, the value is the English heading
# that fetch_blocks_from_about() uses to collect the block list.
language_mapping = {"Reject": "Suspended servers"}
44
def _deobscure_blocked(blocked: str, domain: str, origin: str, nodeinfo_url: str):
    """Resolve an obscured domain name (one containing '*' or '?').

    Returns a tuple (blocked, origin, nodeinfo_url). For a name without
    wildcards the arguments are returned unchanged; for an obscured name the
    three values come from the row returned by instances.deobscure(). Returns
    None when the name is obscured but no row was found.
    """
    # '*' takes precedence over '?', only the first matching wildcard is used
    for wildcard in ("*", "?"):
        if blocked.count(wildcard) == 0:
            continue

        # Obscured domain name with no hash
        row = instances.deobscure(wildcard, blocked)

        logger.debug("row[]='%s'", type(row))
        if row is None:
            logger.warning("Cannot deobsfucate blocked='%s',domain='%s',origin='%s' - SKIPPED!", blocked, domain, origin)
            return None

        logger.debug("blocked='%s' de-obscured to '%s'", blocked, row[0])
        return row[0], row[1], row[2]

    return blocked, origin, nodeinfo_url

def _prepare_blocked(blocked: str, domain: str, block_level: str, origin: str, nodeinfo_url: str, command: str, commit_first: bool = False):
    """Tidy up, filter, de-obscure and register a single blocked entry.

    Returns the final blocked domain name, or None when the entry must be
    skipped (empty, reverse IP, fake TLD, not de-obscurable or not wanted).

    origin and nodeinfo_url are rebound only inside this helper, so values
    taken from a de-obscured row never leak into the next entry of the caller.
    command is the name recorded by instances.add(); it must be passed in
    because inspect.currentframe() here would report this helper instead.
    """
    logger.debug("blocked='%s' - BEFORE!", blocked)
    blocked = tidyup.domain(blocked)
    logger.debug("blocked='%s' - AFTER!", blocked)

    if blocked == "":
        logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
        return None
    elif blocked.endswith(".arpa"):
        logger.debug("blocked='%s' is a reverse IP address - SKIPPED!", blocked)
        return None
    elif blocked.endswith(".tld"):
        logger.debug("blocked='%s' is a fake domain - SKIPPED!", blocked)
        return None

    resolved = _deobscure_blocked(blocked, domain, origin, nodeinfo_url)
    if resolved is None:
        return None

    blocked, origin, nodeinfo_url = resolved

    logger.debug("blocked='%s' - DEOBSFUCATED!", blocked)
    if not utils.is_domain_wanted(blocked):
        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
        return None
    elif not instances.is_registered(blocked):
        if commit_first:
            # Commit changes
            logger.debug("Invoking commit() ...")
            database.connection.commit()

        logger.debug("Domain blocked='%s' wasn't found, adding ..., domain='%s',origin='%s',nodeinfo_url='%s'", blocked, domain, origin, nodeinfo_url)
        instances.add(blocked, domain, command, nodeinfo_url)

    return blocked

def _store_block(domain: str, blocked: str, reason, block_level: str, blockdict: list) -> bool:
    """Add a new block record or refresh the last-seen mark of an existing one.

    Returns True when the block was newly added, False when it already
    existed. Newly added 'reject' blocks are also appended to blockdict.
    """
    if blocks.is_instance_blocked(domain, blocked, block_level):
        logger.debug("Updating block last seen for domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
        blocks.update_last_seen(domain, blocked, block_level)
        return False

    logger.debug("Blocking domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
    blocks.add_instance(domain, blocked, reason, block_level)

    if block_level == "reject":
        logger.debug("Appending blocked='%s' ...", blocked)
        blockdict.append({
            "blocked": blocked,
            "reason" : reason
        })

    return True

def _store_reason(reason, domain: str, blocked: str, block_level: str, blockdict: list):
    """Update the reason of a block record and of matching blockdict entries."""
    logger.debug("Updating block reason: reason='%s',domain='%s',blocked='%s',block_level='%s'", reason, domain, blocked, block_level)
    blocks.update_reason(reason, domain, blocked, block_level)

    logger.debug("Checking %d blockdict records ...", len(blockdict))
    for entry in blockdict:
        if entry["blocked"] == blocked:
            logger.debug("Updating entry reason: blocked='%s',reason='%s'", blocked, reason)
            entry["reason"] = reason

def _normalize_reason(reason):
    """Return the tidied reason text from a str or a dict with key 'reason'.

    None is passed through; any other type raises ValueError.
    """
    if isinstance(reason, str):
        logger.debug("reason[] is a string")
        return tidyup.reason(reason)
    elif isinstance(reason, dict) and "reason" in reason:
        logger.debug("reason[] is a dict")
        return tidyup.reason(reason["reason"])
    elif reason is not None:
        raise ValueError(f"Cannot handle reason[]='{type(reason)}'")

    return None

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Fetch the block lists of a Pleroma instance and store them.

    The nodeinfo document is read through federation.fetch_nodeinfo() and its
    'metadata' -> 'federation' section is evaluated in two passes: first the
    blocked domains ('mrf_simple' merged with 'quarantined_instances', or
    'quarantined_instances' alone), then the reasons ('mrf_simple_info' /
    'quarantined_instances_info'). When none of these keys is present, the
    instance's about page is parsed through fetch_blocks_from_about().

    Parameters:
        domain: The instance to fetch blocks from, checked by
            domain_helper.raise_on().
        origin: Origin of the instance (only used for logging here), str or None.
        nodeinfo_url: URL of the nodeinfo document, a non-empty str.

    Returns:
        None. Results are written through the blocks/instances models and
        committed on database.connection.

    Raises:
        ValueError: When origin or nodeinfo_url has a wrong type or is empty,
            or when a block reason has an unsupported type.
    """
    logger.debug("domain='%s',origin='%s',nodeinfo_url='%s' - CALLED!", domain, origin, nodeinfo_url)
    domain_helper.raise_on(domain)
    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # Name stored with newly registered instances, taken here so that the
    # helpers record this function and not themselves
    command = inspect.currentframe().f_code.co_name

    # @TODO Unused blockdict
    blockdict = list()
    rows = None
    try:
        logger.debug("Fetching nodeinfo: domain='%s',nodeinfo_url='%s'", domain, nodeinfo_url)
        rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching nodeinfo from domain='%s'", type(exception), domain)
        instances.set_last_error(domain, exception)

    if rows is None:
        logger.warning("Could not fetch nodeinfo from domain='%s'", domain)
        return
    elif "metadata" not in rows:
        logger.warning("rows()=%d does not have key 'metadata', domain='%s'", len(rows), domain)
        return
    elif "federation" not in rows["metadata"]:
        logger.warning("rows()=%d does not have key 'federation', domain='%s'", len(rows['metadata']), domain)
        return

    data = rows["metadata"]["federation"]
    found = False

    # Blocked domains
    logger.debug("data[]='%s'", type(data))
    if "mrf_simple" in data:
        logger.debug("Found mrf_simple: domain='%s'", domain)
        found = True

        # 'quarantined_instances' is optional, merge it only when present
        block_lists = dict(data["mrf_simple"])
        if "quarantined_instances" in data:
            block_lists["quarantined_instances"] = data["quarantined_instances"]

        for block_level, blocklist in block_lists.items():
            logger.debug("block_level='%s', blocklist()=%d", block_level, len(blocklist))
            block_level = tidyup.domain(block_level)
            logger.debug("block_level='%s' - AFTER!", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(blocklist), domain, block_level)
            for entry in blocklist:
                blocked = _prepare_blocked(entry, domain, block_level, origin, nodeinfo_url, command, commit_first=True)
                if blocked is None:
                    continue

                _store_block(domain, blocked, None, block_level, blockdict)
    elif "quarantined_instances" in data:
        logger.debug("Found 'quarantined_instances' in JSON response: domain='%s'", domain)
        found = True
        block_level = "quarantined"

        for entry in data["quarantined_instances"]:
            blocked = _prepare_blocked(entry, domain, block_level, origin, nodeinfo_url, command, commit_first=True)
            if blocked is None:
                continue

            _store_block(domain, blocked, None, block_level, blockdict)
    else:
        logger.warning("Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='%s'", domain)

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    # Reasons
    if "mrf_simple_info" in data:
        logger.debug("Found mrf_simple_info in API response: domain='%s'", domain)
        found = True
        for block_level, info in (
            {
                **data["mrf_simple_info"],
                **(data["quarantined_instances_info"] if "quarantined_instances_info" in data else {})
            }
        ).items():
            logger.debug("block_level='%s', info.items()=%d", block_level, len(info.items()))
            block_level = tidyup.domain(block_level)
            logger.debug("block_level='%s' - AFTER!", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(info.items()), domain, block_level)
            for entry, reason in info.items():
                logger.debug("entry='%s',reason[%s]='%s' - BEFORE!", entry, type(reason), reason)
                reason = _normalize_reason(reason)
                logger.debug("entry='%s',reason='%s' - AFTER!", entry, reason)

                blocked = _prepare_blocked(entry, domain, block_level, origin, nodeinfo_url, command)
                if blocked is None:
                    continue

                _store_reason(reason, domain, blocked, block_level, blockdict)
    elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
        logger.debug("Found 'quarantined_instances_info' in JSON response: domain='%s'", domain)
        found = True
        block_level = "quarantined"

        rows = data["quarantined_instances_info"]["quarantined_instances"]
        # Look the reason up under the raw key: the tidied name may differ
        # from the key and must not be used for the lookup
        for entry, info in rows.items():
            if "reason" not in info:
                logger.warning("Cannot find blocked='%s' in rows()=%d,domain='%s' - BREAK!", entry, len(rows), domain)
                break

            reason = info["reason"]
            logger.debug("reason='%s'", reason)

            blocked = _prepare_blocked(entry, domain, block_level, origin, nodeinfo_url, command)
            if blocked is None:
                continue

            _store_reason(reason, domain, blocked, block_level, blockdict)
    else:
        logger.warning("Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='%s'", domain)

    if not found:
        logger.debug("Did not find any useable JSON elements, domain='%s', continuing with /about page ...", domain)
        blocklist = fetch_blocks_from_about(domain)

        logger.debug("blocklist()=%d", len(blocklist))
        if len(blocklist) > 0:
            logger.info("Checking %d record(s) ...", len(blocklist))
            for block_level, records in blocklist.items():
                logger.debug("block_level='%s'", block_level)

                logger.debug("records[%s]()=%d'", type(records), len(records))
                for record in records:
                    logger.debug("record[]='%s'", type(record))
                    reason = tidyup.reason(record["reason"])

                    blocked = _prepare_blocked(record["blocked"], domain, block_level, origin, nodeinfo_url, command)
                    if blocked is None:
                        continue

                    # An already known block gets its last-seen mark refreshed
                    # by _store_block(), its reason is then updated as well
                    if not _store_block(domain, blocked, reason, block_level, blockdict):
                        logger.debug("Updating block reason for domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
                        blocks.update_reason(reason, domain, blocked, block_level)

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("EXIT!")
475
def fetch_blocks_from_about(domain: str) -> dict:
    """Parse the block lists published on an instance's about page.

    The page "/instance/about/index.html" is fetched and every <h2> heading
    is translated through language_mapping. For each known heading the rows
    of the next following <table> (except its first row) are read as
    blocked domain (first cell) and reason (second cell).

    Parameters:
        domain: The instance to fetch from, checked by domain_helper.raise_on().

    Returns:
        A dict with the keys "reject", "media_removal" and "followers_only",
        each holding a list of {"blocked": ..., "reason": ...} dicts. All
        lists are empty when the page could not be fetched.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    logger.debug("Fetching mastodon blocks from domain='%s'", domain)
    doc = None
    for path in ["/instance/about/index.html"]:
        try:
            # Resetting doc type
            doc = None

            logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
            response = network.fetch_response(
                domain,
                path,
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            )

            logger.debug("response.ok='%s',response.status_code='%d',response.text()=%d", response.ok, response.status_code, len(response.text))
            if not response.ok or response.text.strip() == "":
                logger.warning("path='%s' does not exist on domain='%s' - SKIPPED!", path, domain)
                continue

            logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
            doc = bs4.BeautifulSoup(
                response.text,
                "html.parser",
            )

            logger.debug("doc[]='%s'", type(doc))
            if doc.find("h2") is not None:
                logger.debug("Found 'h2' header in path='%s' - BREAK!", path)
                break

        except network.exceptions as exception:
            logger.warning("Cannot fetch from domain='%s',exception[%s]='%s'", domain, type(exception), exception)
            instances.set_last_error(domain, exception)
            break

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
        # Same keys as the regular return value below, so callers always get
        # block levels as keys
        return {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
        }

    # Collected records, keyed by the (English) heading they were found under
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    for header in doc.find_all("h2"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s' - BEFORE!", header_text)
        if header_text in language_mapping:
            logger.debug("header_text='%s' - FOUND!", header_text)
            header_text = language_mapping[header_text]
        else:
            logger.warning("header_text='%s' not found in language mapping table", header_text)

        logger.debug("header_text='%s' - AFTER!", header_text)
        if header_text not in blocklist:
            logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist))
            continue

        # find_next() searches everything after the heading, not only its
        # siblings; it returns None when no table follows
        table = header.find_next("table")
        if table is None:
            logger.warning("No table found after header_text='%s',domain='%s' - SKIPPED!", header_text, domain)
            continue

        logger.debug("Found header_text='%s', importing domain blocks ...", header_text)
        # The first row is skipped (presumably the table's header row)
        for line in table.find_all("tr")[1:]:
            logger.debug("line[]='%s'", type(line))
            cells = line.find_all("td")
            if len(cells) < 2:
                logger.debug("line has only %d cell(s), domain='%s' - SKIPPED!", len(cells), domain)
                continue

            blocklist[header_text].append({
                "blocked": tidyup.domain(cells[0].text),
                "reason" : tidyup.reason(cells[1].text),
            })

    logger.debug("Returning blocklist for domain='%s' - EXIT!", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }