]> git.mxchange.org Git - fba.git/blob - fba/networks/pleroma.py
de5864c1f0155c5a58df8b3ade4f13612cab7ff9
[fba.git] / fba / networks / pleroma.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import inspect
18 import logging
19
20 import bs4
21 import validators
22
23 from fba import fba
24
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import tidyup
28
29 from fba.http import federation
30 from fba.http import network
31
32 from fba.models import blocks
33 from fba.models import instances
34
# Configure root logging at INFO level on import and create this module's
# logger, named after the module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Language mapping X -> English: translates a section heading found on an
# instance's about page into the English heading that
# fetch_blocks_from_about() uses as a key of its block list.
language_mapping = {
    # English -> English
    "Reject": "Suspended servers",
}
43
def _resolve_blocked(blocked: str, domain: str, origin: str, nodeinfo_url: str, block_level: str, command: str, commit_first: bool = False):
    """Deobscure, validate and register one already tidied blocked domain.

    Returns the (possibly deobscured) domain name, or None when the entry
    has to be skipped. 'origin' and 'nodeinfo_url' are only replaced locally
    by a deobscured row, so they never leak into the caller's next entry.
    'command' is handed to instances.add(); 'commit_first' commits the
    connection right before a new instance is added.
    """
    if blocked == "":
        logger.warning("blocked is empty after tidyup.domain(): domain='%s',block_level='%s'", domain, block_level)
        return None
    if blacklist.is_blacklisted(blocked):
        logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
        return None

    # Obscured domain name with no hash; '*' is tried before '?' and only
    # the first matching wildcard is handled.
    for wildcard in ("*", "?"):
        if wildcard not in blocked:
            continue

        row = instances.deobscure(wildcard, blocked)

        logger.debug("row[]='%s'", type(row))
        if row is None:
            logger.warning("Cannot deobsfucate blocked='%s',domain='%s',origin='%s' - SKIPPED!", blocked, domain, origin)
            return None

        logger.debug("blocked='%s' de-obscured to '%s'", blocked, row[0])
        blocked, origin, nodeinfo_url = row[0], row[1], row[2]
        break

    logger.debug("blocked='%s'", blocked)
    if not validators.domain(blocked):
        logger.warning("blocked='%s',software='pleroma' is not a valid domain name - SKIPPED!", blocked)
        return None
    if blocked.endswith(".arpa"):
        logger.warning("blocked='%s' is a reversed .arpa domain and should not be used generally.", blocked)
        return None
    if blocked.endswith(".tld"):
        logger.warning("blocked='%s' is a fake domain, please don't crawl them!", blocked)
        return None
    if blacklist.is_blacklisted(blocked):
        # Checked again because deobscuring may have changed the name
        logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
        return None

    if not instances.is_registered(blocked):
        if commit_first:
            # Commit changes
            fba.connection.commit()

        logger.debug("Domain blocked='%s' wasn't found, adding ..., domain='%s',origin='%s',nodeinfo_url='%s'", blocked, domain, origin, nodeinfo_url)
        instances.add(blocked, domain, command, nodeinfo_url)

    return blocked

def _add_block_if_new(domain: str, blocked: str, reason, block_level: str, blockdict: list) -> bool:
    """Add the block when it is not stored yet; return True when it was added."""
    if blocks.is_instance_blocked(domain, blocked, block_level):
        return False

    logger.debug("Blocking: domain='%s',blocked='%s',block_level='%s'", domain, blocked, block_level)
    blocks.add_instance(domain, blocked, reason, block_level)

    if block_level == "reject":
        logger.debug("Adding to blockdict: blocked='%s'", blocked)
        blockdict.append({
            "blocked": blocked,
            "reason" : reason
        })

    return True

def _import_block_list(domain: str, origin: str, nodeinfo_url: str, block_level: str, blocklist, blockdict: list, command: str):
    """Store every usable entry of one reason-less block list under 'block_level'."""
    for blocked in blocklist:
        logger.debug("BEFORE blocked='%s'", blocked)
        blocked = tidyup.domain(blocked)
        logger.debug("AFTER blocked='%s'", blocked)

        blocked = _resolve_blocked(blocked, domain, origin, nodeinfo_url, block_level, command, commit_first=True)
        if blocked is None:
            continue

        if not _add_block_if_new(domain, blocked, None, block_level, blockdict):
            logger.debug("Updating block last seen for domain='%s',blocked='%s' ...", domain, blocked)
            blocks.update_last_seen(domain, blocked, block_level)

def _store_reason(reason, domain: str, blocked: str, block_level: str, blockdict: list):
    """Update the stored block reason and mirror it into matching blockdict entries."""
    logger.debug("Updating block reason: reason='%s',domain='%s',blocked='%s',block_level='%s'", reason, domain, blocked, block_level)
    blocks.update_reason(reason, domain, blocked, block_level)

    logger.debug("blockdict()=%d", len(blockdict))
    for entry in blockdict:
        if entry["blocked"] == blocked:
            logger.debug("Updating entry reason: blocked='%s',reason='%s'", blocked, reason)
            entry["reason"] = reason

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
    """Import the blocks an instance publishes through its nodeinfo.

    Reads rows["metadata"]["federation"] of the fetched nodeinfo: first the
    block lists ('mrf_simple' merged with 'quarantined_instances', or
    'quarantined_instances' alone), then the reasons ('mrf_simple_info'
    merged with 'quarantined_instances_info', or the latter alone). When
    none of these keys is present, the records returned by
    fetch_blocks_from_about() are imported instead.

    Parameters:
        domain:       lower-case domain of the instance being crawled
        origin:       non-empty string or None; only used in log messages
        nodeinfo_url: non-empty string handed to federation.fetch_nodeinfo()
                      and to instances.add() for newly found domains

    Returns None. When nodeinfo cannot be fetched or lacks
    'metadata'/'federation', the function returns early without importing.

    Raises:
        ValueError: a parameter fails validation, or a reason in
                    'mrf_simple_info' is neither str, dict with key
                    'reason', nor None.
    """
    logger.debug("domain='%s',origin='%s',nodeinfo_url='%s' - CALLED!", domain, origin, nodeinfo_url)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # Resolved here (not inside a helper) so that instances.add() keeps
    # receiving this function's name as the command.
    command = inspect.currentframe().f_code.co_name

    # @TODO Unused blockdict
    blockdict = []
    rows = None
    try:
        logger.debug("Fetching nodeinfo: domain='%s',nodeinfo_url='%s'", domain, nodeinfo_url)
        rows = federation.fetch_nodeinfo(domain, nodeinfo_url)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching nodeinfo", type(exception))
        instances.set_last_error(domain, exception)

    if rows is None:
        logger.warning("Could not fetch nodeinfo from domain='%s'", domain)
        return
    elif "metadata" not in rows:
        logger.warning("rows()=%d does not have key 'metadata', domain='%s'", len(rows), domain)
        return
    elif "federation" not in rows["metadata"]:
        logger.warning("rows()=%d does not have key 'federation', domain='%s'", len(rows["metadata"]), domain)
        return

    data = rows["metadata"]["federation"]
    found = False

    # Block lists
    logger.debug("data[]='%s'", type(data))
    if "mrf_simple" in data:
        logger.debug("Found mrf_simple: domain='%s'", domain)
        found = True

        # 'quarantined_instances' is optional; a missing key must not abort
        # the import of the 'mrf_simple' lists.
        levels = {
            **data["mrf_simple"],
            "quarantined_instances": data.get("quarantined_instances", []),
        }

        for block_level, blocklist in levels.items():
            logger.debug("block_level='%s',blocklist()=%d", block_level, len(blocklist))
            block_level = tidyup.domain(block_level)
            logger.debug("AFTER block_level='%s'", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',block_level='%s' ...", len(blocklist), domain, block_level)
            _import_block_list(domain, origin, nodeinfo_url, block_level, blocklist, blockdict, command)
    elif "quarantined_instances" in data:
        logger.debug("Found 'quarantined_instances' in JSON response: domain='%s'", domain)
        found = True
        _import_block_list(domain, origin, nodeinfo_url, "quarantined", data["quarantined_instances"], blockdict, command)
    else:
        logger.warning("Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='%s'", domain)

    logger.debug("Committing changes ...")
    fba.connection.commit()

    # Reasons
    if "mrf_simple_info" in data:
        logger.debug("Found mrf_simple_info: domain='%s'", domain)
        found = True

        infos = {
            **data["mrf_simple_info"],
            **data.get("quarantined_instances_info", {}),
        }

        for block_level, info in infos.items():
            logger.debug("block_level='%s',info()=%d", block_level, len(info.items()))
            block_level = tidyup.domain(block_level)
            logger.debug("AFTER block_level='%s'", block_level)

            if block_level == "":
                logger.warning("block_level is now empty!")
                continue
            elif block_level == "accept":
                logger.debug("domain='%s' skipping block_level='accept'", domain)
                continue

            logger.debug("Checking %d entries from domain='%s',software='pleroma',block_level='%s' ...", len(info.items()), domain, block_level)
            for blocked, reason in info.items():
                logger.debug("blocked='%s',reason[%s]='%s' - BEFORE!", blocked, type(reason), reason)
                blocked = tidyup.domain(blocked)

                if isinstance(reason, str):
                    logger.debug("reason[] is a string")
                    reason = tidyup.reason(reason)
                elif isinstance(reason, dict) and "reason" in reason:
                    logger.debug("reason[] is a dict")
                    reason = tidyup.reason(reason["reason"])
                elif reason is not None:
                    raise ValueError(f"Cannot handle reason[]='{type(reason)}'")

                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                blocked = _resolve_blocked(blocked, domain, origin, nodeinfo_url, block_level, command)
                if blocked is None:
                    continue

                _store_reason(reason, domain, blocked, block_level, blockdict)
    elif "quarantined_instances_info" in data and "quarantined_instances" in data["quarantined_instances_info"]:
        logger.debug("Found 'quarantined_instances_info' in JSON response: domain='%s'", domain)
        found = True
        block_level = "quarantined"

        quarantined = data["quarantined_instances_info"]["quarantined_instances"]
        for blocked in quarantined:
            logger.debug("BEFORE blocked='%s'", blocked)
            blocked = tidyup.domain(blocked)
            logger.debug("AFTER blocked='%s'", blocked)

            # The tidied name is looked up again; an entry without a usable
            # reason stops processing of the remaining entries.
            if blocked not in quarantined or "reason" not in quarantined[blocked]:
                logger.warning("Cannot find blocked='%s' in rows()=%d,domain='%s'", blocked, len(quarantined), domain)
                break

            reason = quarantined[blocked]["reason"]
            logger.debug("reason='%s'", reason)

            blocked = _resolve_blocked(blocked, domain, origin, nodeinfo_url, block_level, command)
            if blocked is None:
                continue

            _store_reason(reason, domain, blocked, block_level, blockdict)
    else:
        logger.warning("Cannot find 'mrf_simple_info' or 'quarantined_instances_info' in JSON reply: domain='%s'", domain)

    if not found:
        logger.debug("Did not find any useable JSON elements, domain='%s', continuing with /about page ...", domain)
        blocklist = fetch_blocks_from_about(domain)

        logger.debug("blocklist()=%d", len(blocklist))
        if len(blocklist) > 0:
            logger.info("Checking %d record(s) ...", len(blocklist))
            for block_level, records in blocklist.items():
                logger.debug("block_level='%s'", block_level)

                logger.debug("records[%s]()=%d", type(records), len(records))
                for record in records:
                    logger.debug("record[]='%s'", type(record))
                    blocked = tidyup.domain(record["blocked"])
                    reason  = tidyup.reason(record["reason"])
                    logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)

                    blocked = _resolve_blocked(blocked, domain, origin, nodeinfo_url, block_level, command)
                    if blocked is None:
                        continue

                    if not _add_block_if_new(domain, blocked, reason, block_level, blockdict):
                        # An already stored block gets its reason refreshed
                        logger.debug("Updating block reason for domain='%s',blocked='%s' ...", domain, blocked)
                        blocks.update_reason(reason, domain, blocked, block_level)

    fba.connection.commit()
    logger.debug("EXIT!")
525
def fetch_blocks_from_about(domain: str) -> dict:
    """Scrape block lists from an instance's about page.

    Fetches '/instance/about/index.html' from 'domain', looks at every 'h2'
    heading (translated through language_mapping) and, for the known
    headings, reads the rows of the next table: first cell is the blocked
    domain, second cell the reason. The table's first row is skipped.

    Returns a dict with the keys 'reject', 'media_removal' and
    'followers_only', each holding a list of
    {"blocked": ..., "reason": ...} records. All lists are empty when the
    page could not be fetched.

    Raises:
        ValueError: 'domain' is not a str, is empty, not lower-case, not a
                    valid domain, or ends with '.arpa' or '.tld'.
    """
    # Validate before touching the value, so a non-string yields ValueError
    logger.debug("domain='%s' - CALLED!", domain)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("Fetching pleroma blocks from domain='%s'", domain)
    doc = None
    for path in ["/instance/about/index.html"]:
        try:
            # Resetting doc type
            doc = None

            logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
            response = network.fetch_response(
                domain,
                path,
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            )

            logger.debug("response.ok='%s',response.status_code='%s',response.text()=%d", response.ok, response.status_code, len(response.text))
            if not response.ok or response.text.strip() == "":
                logger.warning("path='%s' does not exist on domain='%s' - SKIPPED!", path, domain)
                continue

            logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
            doc = bs4.BeautifulSoup(
                response.text,
                "html.parser",
            )

            logger.debug("doc[]='%s'", type(doc))
            if doc.find("h2") is not None:
                logger.debug("Found 'h2' header in path='%s' - BREAK!", path)
                break

        except network.exceptions as exception:
            logger.warning("Cannot fetch from domain='%s': exception[%s]='%s'", domain, type(exception), exception)
            instances.set_last_error(domain, exception)
            break

    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning("Cannot fetch any /about pages for domain='%s' - EXIT!", domain)
    else:
        # Headings are matched case-insensitively against the known keys
        known_headers = {key.lower(): key for key in blocklist}

        for header in doc.find_all("h2"):
            header_text = tidyup.reason(header.text)

            logger.debug("header_text='%s' - BEFORE!", header_text)
            if header_text in language_mapping:
                logger.debug("header_text='%s' - FOUND!", header_text)
                header_text = language_mapping[header_text]
            else:
                logger.warning("header_text='%s' not found in language mapping table", header_text)

            logger.debug("header_text='%s' - AFTER!", header_text)
            key = known_headers.get(header_text.lower())
            if key is None:
                logger.warning("header_text='%s' not found in blocklist()=%d", header_text, len(blocklist))
                continue

            # find_next() searches everything after the heading, not only
            # its siblings, so a table nested deeper is still found.
            table = header.find_next("table")
            if table is None:
                logger.warning("header_text='%s' is not followed by a table, domain='%s' - SKIPPED!", header_text, domain)
                continue

            logger.debug("Found header_text='%s', importing domain blocks ...", header_text)
            for line in table.find_all("tr")[1:]:
                logger.debug("line[]='%s'", type(line))
                cells = line.find_all("td")
                if len(cells) < 2:
                    # Needs both a domain cell and a reason cell
                    logger.warning("Table row with %d cell(s) below header_text='%s' - SKIPPED!", len(cells), header_text)
                    continue

                blocklist[key].append({
                    "blocked": tidyup.domain(cells[0].text),
                    "reason" : tidyup.reason(cells[1].text),
                })

    # Same result shape on every path, including the failed fetch above
    logger.debug("Returning blocklist for domain='%s'", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }