]> git.mxchange.org Git - fba.git/blob - fba/commands.py
fb363fe6ea528cc506024621360683003ccbf62f
[fba.git] / fba / commands.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import csv
18 import inspect
19 import json
20 import logging
21 import time
22
23 from urllib.parse import urlparse
24
25 import argparse
26 import atoma
27 import bs4
28 import markdown
29 import reqto
30 import validators
31
32 from fba import csrf
33 from fba import database
34 from fba import utils
35
36 from fba.helpers import blacklist
37 from fba.helpers import config
38 from fba.helpers import cookies
39 from fba.helpers import dicts as dict_helper
40 from fba.helpers import domain as domain_helper
41 from fba.helpers import locking
42 from fba.helpers import processing
43 from fba.helpers import software as software_helper
44 from fba.helpers import tidyup
45
46 from fba.http import federation
47 from fba.http import network
48
49 from fba.models import blocks
50 from fba.models import instances
51 from fba.models import sources
52
53 from fba.networks import friendica
54 from fba.networks import lemmy
55 from fba.networks import mastodon
56 from fba.networks import misskey
57 from fba.networks import pleroma
58
59 logging.basicConfig(level=logging.INFO)
60 logger = logging.getLogger(__name__)
61 #logger.setLevel(logging.DEBUG)
62
63 def check_instance(args: argparse.Namespace) -> int:
64     logger.debug("args.domain='%s' - CALLED!", args.domain)
65     status = 0
66     if not validators.domain(args.domain):
67         logger.warning("args.domain='%s' is not valid", args.domain)
68         status = 100
69     elif blacklist.is_blacklisted(args.domain):
70         logger.warning("args.domain='%s' is blacklisted", args.domain)
71         status = 101
72     elif instances.is_registered(args.domain):
73         logger.warning("args.domain='%s' is already registered", args.domain)
74         status = 102
75     else:
76         logger.info("args.domain='%s' is not known", args.domain)
77
78     logger.debug("status=%d - EXIT!", status)
79     return status
80
81 def check_nodeinfo(args: argparse.Namespace) -> int:
82     logger.debug("args[]='%s' - CALLED!", type(args))
83
84     # Fetch rows
85     database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")
86
87     cnt = 0
88     for row in database.cursor.fetchall():
89         logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
90         punycode = row["domain"].encode("idna").decode("utf-8")
91
92         if row["nodeinfo_url"].startswith("/"):
93             logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
94             continue
95         elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
96             logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
97             cnt = cnt + 1
98
99     logger.info("Found %d row(s)", cnt)
100
101     logger.debug("EXIT!")
102     return 0
103
104 def fetch_pixelfed_api(args: argparse.Namespace) -> int:
105     logger.debug("args[]='%s' - CALLED!", type(args))
106
107     # No CSRF by default, you don't have to add network.source_headers by yourself here
108     headers = tuple()
109     source_domain = "pixelfed.org"
110
111     if sources.is_recent(source_domain):
112         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
113         return 0
114     else:
115         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
116         sources.update(source_domain)
117
118     try:
119         logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
120         headers = csrf.determine(source_domain, dict())
121     except network.exceptions as exception:
122         logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
123         return list()
124
125     try:
126         logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
127         fetched = network.get_json_api(
128             source_domain,
129             "/api/v1/servers/all.json?scope=All&country=all&language=all",
130             headers,
131             (config.get("connection_timeout"), config.get("read_timeout"))
132         )
133
134         logger.debug("JSON API returned %d elements", len(fetched))
135         if "error_message" in fetched:
136             logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
137             return 101
138         elif "data" not in fetched["json"]:
139             logger.warning("API did not return JSON with 'data' element - EXIT!")
140             return 102
141
142         rows = fetched["json"]["data"]
143         logger.info("Checking %d fetched rows ...", len(rows))
144         for row in rows:
145             logger.debug("row[]='%s'", type(row))
146             if "domain" not in row:
147                 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
148                 continue
149             elif row["domain"] == "":
150                 logger.debug("row[domain] is empty - SKIPPED!")
151                 continue
152
153             logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
154             domain = row["domain"].encode("idna").decode("utf-8")
155             logger.debug("domain='%s' - AFTER!", domain)
156
157             if not domain_helper.is_wanted(domain):
158                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
159                 continue
160             elif instances.is_registered(domain):
161                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
162                 continue
163             elif instances.is_recent(domain):
164                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
165                 continue
166
167             logger.debug("Fetching instances from domain='%s' ...", domain)
168             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
169
170     except network.exceptions as exception:
171         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
172         return 103
173
174     logger.debug("Success! - EXIT!")
175     return 0
176
177 def fetch_bkali(args: argparse.Namespace) -> int:
178     logger.debug("args[]='%s' - CALLED!", type(args))
179
180     logger.debug("Invoking locking.acquire() ...")
181     locking.acquire()
182
183     source_domain = "gql.api.bka.li"
184     if sources.is_recent(source_domain):
185         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
186         return 0
187     else:
188         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
189         sources.update(source_domain)
190
191     domains = list()
192     try:
193         logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
194         fetched = network.post_json_api(
195             source_domain,
196             "/v1/graphql",
197             json.dumps({
198                 "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
199             })
200         )
201
202         logger.debug("fetched[]='%s'", type(fetched))
203         if "error_message" in fetched:
204             logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
205             return 100
206         elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
207             logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])
208             return 101
209
210         rows = fetched["json"]
211
212         logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
213         if len(rows) == 0:
214             raise Exception("WARNING: Returned no records")
215         elif "data" not in rows:
216             raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
217         elif "nodeinfo" not in rows["data"]:
218             raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")
219
220         for entry in rows["data"]["nodeinfo"]:
221             logger.debug("entry[%s]='%s'", type(entry), entry)
222             if "domain" not in entry:
223                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
224                 continue
225             elif entry["domain"] == "":
226                 logger.debug("entry[domain] is empty - SKIPPED!")
227                 continue
228             elif not domain_helper.is_wanted(entry["domain"]):
229                 logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
230                 continue
231             elif instances.is_registered(entry["domain"]):
232                 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
233                 continue
234             elif instances.is_recent(entry["domain"]):
235                 logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
236                 continue
237
238             logger.debug("Adding domain='%s' ...", entry["domain"])
239             domains.append(entry["domain"])
240
241     except network.exceptions as exception:
242         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
243         return 102
244
245     logger.debug("domains()=%d", len(domains))
246     if len(domains) > 0:
247         logger.info("Adding %d new instances ...", len(domains))
248         for domain in domains:
249             logger.debug("domain='%s' - BEFORE!", domain)
250             domain = domain.encode("idna").decode("utf-8")
251             logger.debug("domain='%s' - AFTER!", domain)
252
253             try:
254                 logger.info("Fetching instances from domain='%s' ...", domain)
255                 federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
256             except network.exceptions as exception:
257                 logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
258                 instances.set_last_error(domain, exception)
259                 return 100
260
261     logger.debug("Success - EXIT!")
262     return 0
263
264 def fetch_blocks(args: argparse.Namespace) -> int:
265     logger.debug("args[]='%s' - CALLED!", type(args))
266     if args.domain is not None and args.domain != "":
267         logger.debug("args.domain='%s' - checking ...", args.domain)
268         if not validators.domain(args.domain):
269             logger.warning("args.domain='%s' is not valid.", args.domain)
270             return 100
271         elif blacklist.is_blacklisted(args.domain):
272             logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
273             return 101
274         elif not instances.is_registered(args.domain):
275             logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
276             return 102
277
278     logger.debug("Invoking locking.acquire() ...")
279     locking.acquire()
280
281     if args.domain is not None and args.domain != "":
282         # Re-check single domain
283         logger.debug("Querying database for args.domain='%s' ...", args.domain)
284         database.cursor.execute(
285             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
286         )
287     elif args.software is not None and args.software != "":
288         # Re-check single software
289         logger.debug("Querying database for args.software='%s' ...", args.software)
290         database.cursor.execute(
291             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
292         )
293     elif args.force:
294         # Re-check all
295         logger.debug("Re-checking all instances ...")
296         database.cursor.execute(
297             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC"
298         )
299     else:
300         # Re-check after "timeout" (aka. minimum interval)
301         database.cursor.execute(
302             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
303         )
304
305     rows = database.cursor.fetchall()
306     logger.info("Checking %d entries ...", len(rows))
307     for blocker, software, origin, nodeinfo_url in rows:
308         logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
309
310         if nodeinfo_url is None:
311             logger.debug("blocker='%s',software='%s' has no nodeinfo_url set - SKIPPED!", blocker, software)
312             continue
313         elif not domain_helper.is_wanted(blocker):
314             logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
315             continue
316
317         logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
318         instances.set_last_blocked(blocker)
319         instances.set_has_obfuscation(blocker, False)
320
321         blocking = list()
322
323         if blocker != "chaos.social":
324             logger.debug("blocker='%s',software='%s'", blocker, software)
325             if software == "pleroma":
326                 logger.info("blocker='%s',software='%s'", blocker, software)
327                 blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
328                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
329             elif software == "mastodon":
330                 logger.info("blocker='%s',software='%s'", blocker, software)
331                 blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
332                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
333             elif software == "lemmy":
334                 logger.info("blocker='%s',software='%s'", blocker, software)
335                 blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
336                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
337             elif software == "friendica":
338                 logger.info("blocker='%s',software='%s'", blocker, software)
339                 blocking = friendica.fetch_blocks(blocker)
340                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
341             elif software == "misskey":
342                 logger.info("blocker='%s',software='%s'", blocker, software)
343                 blocking = misskey.fetch_blocks(blocker)
344                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
345             else:
346                 logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
347
348             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
349             instances.set_total_blocks(blocker, blocking)
350         else:
351             logger.debug("Skipping chaos.social, run ./fba.py fetch_cs instead!")
352
353         logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
354         blockdict = list()
355         for block in blocking:
356             logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])
357
358             if block["block_level"] == "":
359                 logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
360                 continue
361
362             logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
363             block["blocked"] = tidyup.domain(block["blocked"])
364             block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
365             logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])
366
367             if block["blocked"] == "":
368                 logger.warning("blocked is empty, blocker='%s'", blocker)
369                 continue
370             elif block["blocked"].endswith(".onion"):
371                 logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
372                 continue
373             elif block["blocked"].endswith(".arpa"):
374                 logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
375                 continue
376             elif block["blocked"].endswith(".tld"):
377                 logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
378                 continue
379             elif block["blocked"].find("*") >= 0:
380                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
381
382                 # Some friendica servers also obscure domains without hash
383                 row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)
384
385                 logger.debug("row[]='%s'", type(row))
386                 if row is None:
387                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
388                     instances.set_has_obfuscation(blocker, True)
389                     continue
390
391                 block["blocked"] = row["domain"]
392                 origin           = row["origin"]
393                 nodeinfo_url     = row["nodeinfo_url"]
394             elif block["blocked"].find("?") >= 0:
395                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
396
397                 # Some obscure them with question marks, not sure if that's dependent on version or not
398                 row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)
399
400                 logger.debug("row[]='%s'", type(row))
401                 if row is None:
402                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
403                     instances.set_has_obfuscation(blocker, True)
404                     continue
405
406                 block["blocked"] = row["domain"]
407                 origin           = row["origin"]
408                 nodeinfo_url     = row["nodeinfo_url"]
409
410             logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
411             if block["blocked"] == "":
412                 logger.debug("block[blocked] is empty - SKIPPED!")
413                 continue
414
415             logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
416             block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
417             logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
418
419             if not domain_helper.is_wanted(block["blocked"]):
420                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
421                 continue
422             elif block["block_level"] in ["accept", "accepted"]:
423                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
424                 continue
425             elif not instances.is_registered(block["blocked"]):
426                 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
427                 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
428
429             block["block_level"] = blocks.alias_block_level(block["block_level"])
430
431             if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
432                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
433                 blockdict.append({
434                     "blocked": block["blocked"],
435                     "reason" : block["reason"],
436                 })
437
438             logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
439             cookies.clear(block["blocked"])
440
441         logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
442         if instances.has_pending(blocker):
443             logger.debug("Flushing updates for blocker='%s' ...", blocker)
444             instances.update_data(blocker)
445
446         logger.debug("Invoking commit() ...")
447         database.connection.commit()
448
449         logger.debug("Invoking cookies.clear(%s) ...", blocker)
450         cookies.clear(blocker)
451
452         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
453         if config.get("bot_enabled") and len(blockdict) > 0:
454             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
455             network.send_bot_post(blocker, blockdict)
456
457     logger.debug("Success! - EXIT!")
458     return 0
459
460 def fetch_observer(args: argparse.Namespace) -> int:
461     logger.debug("args[]='%s' - CALLED!", type(args))
462
463     logger.debug("Invoking locking.acquire() ...")
464     locking.acquire()
465
466     source_domain = "fediverse.observer"
467     if sources.is_recent(source_domain):
468         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
469         return 0
470     else:
471         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
472         sources.update(source_domain)
473
474     types = list()
475     if args.software is None:
476         logger.info("Fetching software list ...")
477         raw = utils.fetch_url(
478             f"https://{source_domain}",
479             network.web_headers,
480             (config.get("connection_timeout"), config.get("read_timeout"))
481         ).text
482         logger.debug("raw[%s]()=%d", type(raw), len(raw))
483
484         doc = bs4.BeautifulSoup(raw, features="html.parser")
485         logger.debug("doc[]='%s'", type(doc))
486
487         navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
488         logger.debug("navbar[]='%s'", type(navbar))
489         if navbar is None:
490             logger.warning("Cannot find navigation bar, cannot continue!")
491             return 1
492
493         items = navbar.findAll("a", {"class": "dropdown-item"})
494         logger.debug("items[]='%s'", type(items))
495
496         logger.info("Checking %d menu items ...", len(items))
497         for item in items:
498             logger.debug("item[%s]='%s'", type(item), item)
499             if item.text.lower() == "all":
500                 logger.debug("Skipping 'All' menu entry ...")
501                 continue
502
503             logger.debug("Appending item.text='%s' ...", item.text)
504             types.append(tidyup.domain(item.text))
505     else:
506         logger.info("Adding args.software='%s' as type ...", args.software)
507         types.append(args.software)
508
509     logger.info("Fetching %d different table data ...", len(types))
510     for software in types:
511         logger.debug("software='%s' - BEFORE!", software)
512         if args.software is not None and args.software != software:
513             logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
514             continue
515
516         doc = None
517         try:
518             logger.debug("Fetching table data for software='%s' ...", software)
519             raw = utils.fetch_url(
520                 f"https://{source_domain}/app/views/tabledata.php?software={software}",
521                 network.web_headers,
522                 (config.get("connection_timeout"), config.get("read_timeout"))
523             ).text
524             logger.debug("raw[%s]()=%d", type(raw), len(raw))
525
526             doc = bs4.BeautifulSoup(raw, features="html.parser")
527             logger.debug("doc[]='%s'", type(doc))
528         except network.exceptions as exception:
529             logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
530             continue
531
532         items = doc.findAll("a", {"class": "url"})
533         logger.info("Checking %d items,software='%s' ...", len(items), software)
534         for item in items:
535             logger.debug("item[]='%s'", type(item))
536             domain = item.decode_contents()
537             logger.debug("domain='%s' - AFTER!", domain)
538
539             if domain == "":
540                 logger.debug("domain is empty - SKIPPED!")
541                 continue
542
543             logger.debug("domain='%s' - BEFORE!", domain)
544             domain = domain.encode("idna").decode("utf-8")
545             logger.debug("domain='%s' - AFTER!", domain)
546
547             if not domain_helper.is_wanted(domain):
548                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
549                 continue
550             elif instances.is_registered(domain):
551                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
552                 continue
553
554             software = software_helper.alias(software)
555             logger.info("Fetching instances for domain='%s'", domain)
556             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
557
558     logger.debug("Success! - EXIT!")
559     return 0
560
561 def fetch_todon_wiki(args: argparse.Namespace) -> int:
562     logger.debug("args[]='%s' - CALLED!", type(args))
563
564     logger.debug("Invoking locking.acquire() ...")
565     locking.acquire()
566
567     source_domain = "wiki.todon.eu"
568     if sources.is_recent(source_domain):
569         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
570         return 0
571     else:
572         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
573         sources.update(source_domain)
574
575     blocklist = {
576         "silenced": list(),
577         "reject": list(),
578     }
579
580     logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
581     raw = utils.fetch_url(
582         f"https://{source_domain}/todon/domainblocks",
583         network.web_headers,
584         (config.get("connection_timeout"), config.get("read_timeout"))
585     ).text
586     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
587
588     doc = bs4.BeautifulSoup(raw, "html.parser")
589     logger.debug("doc[]='%s'", type(doc))
590
591     silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
592     logger.info("Checking %d silenced/limited entries ...", len(silenced))
593     blocklist["silenced"] = utils.find_domains(silenced, "div")
594
595     suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
596     logger.info("Checking %d suspended entries ...", len(suspended))
597     blocklist["reject"] = utils.find_domains(suspended, "div")
598
599     blocking = blocklist["silenced"] + blocklist["reject"]
600     blocker = "todon.eu"
601
602     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
603     instances.set_last_blocked(blocker)
604     instances.set_total_blocks(blocker, blocking)
605
606     blockdict = list()
607     for block_level in blocklist:
608         blockers = blocklist[block_level]
609
610         logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
611         for blocked in blockers:
612             logger.debug("blocked='%s'", blocked)
613
614             if not instances.is_registered(blocked):
615                 try:
616                     logger.info("Fetching instances from domain='%s' ...", blocked)
617                     federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
618                 except network.exceptions as exception:
619                     logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
620                     instances.set_last_error(blocked, exception)
621
622             if blocks.is_instance_blocked(blocker, blocked, block_level):
623                 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
624                 continue
625
626             logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
627             if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
628                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
629                 blockdict.append({
630                     "blocked": blocked,
631                     "reason" : None,
632                 })
633
634         logger.debug("Invoking commit() ...")
635         database.connection.commit()
636
637         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
638         if config.get("bot_enabled") and len(blockdict) > 0:
639             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
640             network.send_bot_post(blocker, blockdict)
641
642     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
643     if instances.has_pending(blocker):
644         logger.debug("Flushing updates for blocker='%s' ...", blocker)
645         instances.update_data(blocker)
646
647     logger.debug("Success! - EXIT!")
648     return 0
649
650 def fetch_cs(args: argparse.Namespace):
651     logger.debug("args[]='%s' - CALLED!", type(args))
652
653     logger.debug("Invoking locking.acquire() ...")
654     locking.acquire()
655
656     extensions = [
657         "extra",
658         "abbr",
659         "attr_list",
660         "def_list",
661         "fenced_code",
662         "footnotes",
663         "md_in_html",
664         "admonition",
665         "codehilite",
666         "legacy_attrs",
667         "legacy_em",
668         "meta",
669         "nl2br",
670         "sane_lists",
671         "smarty",
672         "toc",
673         "wikilinks"
674     ]
675
676     blocklist = {
677         "silenced": list(),
678         "reject"  : list(),
679     }
680
681     source_domain = "raw.githubusercontent.com"
682     if sources.is_recent(source_domain):
683         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
684         return 0
685     else:
686         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
687         sources.update(source_domain)
688
689     logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
690     raw = utils.fetch_url(
691         f"https://{source_domain}/chaossocial/meta/master/federation.md",
692         network.web_headers,
693         (config.get("connection_timeout"), config.get("read_timeout"))
694     ).text
695     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
696
697     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
698     logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
699
700     silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
701     logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
702     blocklist["silenced"] = federation.find_domains(silenced)
703
704     blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
705     logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
706     blocklist["reject"] = federation.find_domains(blocked)
707
708     blocking = blocklist["silenced"] + blocklist["reject"]
709     blocker = "chaos.social"
710
711     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
712     instances.set_last_blocked(blocker)
713     instances.set_total_blocks(blocker, blocking)
714
715     logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
716     if len(blocking) > 0:
717         blockdict = list()
718         for block_level in blocklist:
719             logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))
720
721             for row in blocklist[block_level]:
722                 logger.debug("row[%s]='%s'", type(row), row)
723                 if not "domain" in row:
724                     logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
725                     continue
726                 elif not instances.is_registered(row["domain"]):
727                     try:
728                         logger.info("Fetching instances from domain='%s' ...", row["domain"])
729                         federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
730                     except network.exceptions as exception:
731                         logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
732                         instances.set_last_error(row["domain"], exception)
733
734                 if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
735                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
736                     blockdict.append({
737                         "blocked": row["domain"],
738                         "reason" : row["reason"],
739                     })
740
741         logger.debug("Invoking commit() ...")
742         database.connection.commit()
743
744         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
745         if config.get("bot_enabled") and len(blockdict) > 0:
746             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
747             network.send_bot_post(blocker, blockdict)
748
749     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
750     if instances.has_pending(blocker):
751         logger.debug("Flushing updates for blocker='%s' ...", blocker)
752         instances.update_data(blocker)
753
754     logger.debug("Success! - EXIT!")
755     return 0
756
757 def fetch_fba_rss(args: argparse.Namespace) -> int:
758     logger.debug("args[]='%s' - CALLED!", type(args))
759
760     domains = list()
761
762     logger.debug("Invoking locking.acquire() ...")
763     locking.acquire()
764
765     components = urlparse(args.feed)
766
767     if sources.is_recent(components.netloc):
768         logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc)
769         return 0
770     else:
771         logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
772         sources.update(components.netloc)
773
774     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
775     response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
776
777     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
778     if response.ok and response.status_code < 300 and len(response.text) > 0:
779         logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
780         rss = atoma.parse_rss_bytes(response.content)
781
782         logger.debug("rss[]='%s'", type(rss))
783         for item in rss.items:
784             logger.debug("item[%s]='%s'", type(item), item)
785             domain = tidyup.domain(item.link.split("=")[1])
786
787             logger.debug("domain='%s' - AFTER!", domain)
788             if domain == "":
789                 logger.debug("domain is empty - SKIPPED!")
790                 continue
791
792             logger.debug("domain='%s' - BEFORE!", domain)
793             domain = domain.encode("idna").decode("utf-8")
794             logger.debug("domain='%s' - AFTER!", domain)
795
796             if not domain_helper.is_wanted(domain):
797                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
798                 continue
799             elif domain in domains:
800                 logger.debug("domain='%s' is already added - SKIPPED!", domain)
801                 continue
802             elif instances.is_registered(domain):
803                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
804                 continue
805             elif instances.is_recent(domain):
806                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
807                 continue
808
809             logger.debug("Adding domain='%s'", domain)
810             domains.append(domain)
811
812     logger.debug("domains()=%d", len(domains))
813     if len(domains) > 0:
814         logger.info("Adding %d new instances ...", len(domains))
815         for domain in domains:
816             logger.debug("domain='%s'", domain)
817             try:
818                 logger.info("Fetching instances from domain='%s' ...", domain)
819                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
820             except network.exceptions as exception:
821                 logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
822                 instances.set_last_error(domain, exception)
823                 return 100
824
825     logger.debug("Success! - EXIT!")
826     return 0
827
828 def fetch_fbabot_atom(args: argparse.Namespace) -> int:
829     logger.debug("args[]='%s' - CALLED!", type(args))
830
831     logger.debug("Invoking locking.acquire() ...")
832     locking.acquire()
833
834     source_domain = "ryona.agency"
835     feed = f"https://{source_domain}/users/fba/feed.atom"
836
837     logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
838     if args.feed is not None and validators.url(args.feed):
839         logger.debug("Setting feed='%s' ...", args.feed)
840         feed = str(args.feed)
841         source_domain = urlparse(args.feed).netloc
842
843     if sources.is_recent(source_domain):
844         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
845         return 0
846     else:
847         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
848         sources.update(source_domain)
849
850     domains = list()
851
852     logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
853     response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
854
855     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
856     if response.ok and response.status_code < 300 and len(response.text) > 0:
857         logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
858         atom = atoma.parse_atom_bytes(response.content)
859
860         logger.debug("atom[]='%s'", type(atom))
861         for entry in atom.entries:
862             logger.debug("entry[]='%s'", type(entry))
863             doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
864             logger.debug("doc[]='%s'", type(doc))
865             for element in doc.findAll("a"):
866                 logger.debug("element[]='%s'", type(element))
867                 for href in element["href"].split(","):
868                     logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
869                     domain = tidyup.domain(href)
870
871                     logger.debug("domain='%s' - AFTER!", domain)
872                     if domain == "":
873                         logger.debug("domain is empty - SKIPPED!")
874                         continue
875
876                     logger.debug("domain='%s' - BEFORE!", domain)
877                     domain = domain.encode("idna").decode("utf-8")
878                     logger.debug("domain='%s' - AFTER!", domain)
879
880                     if not domain_helper.is_wanted(domain):
881                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
882                         continue
883                     elif domain in domains:
884                         logger.debug("domain='%s' is already added - SKIPPED!", domain)
885                         continue
886                     elif instances.is_registered(domain):
887                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
888                         continue
889                     elif instances.is_recent(domain):
890                         logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
891                         continue
892
893                     logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
894                     domains.append(domain)
895
896     logger.debug("domains()=%d", len(domains))
897     if len(domains) > 0:
898         logger.info("Adding %d new instances ...", len(domains))
899         for domain in domains:
900             logger.debug("domain='%s'", domain)
901             try:
902                 logger.info("Fetching instances from domain='%s' ...", domain)
903                 federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
904             except network.exceptions as exception:
905                 logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
906                 instances.set_last_error(domain, exception)
907                 return 100
908
909     logger.debug("Success! - EXIT!")
910     return 0
911
912 def fetch_instances(args: argparse.Namespace) -> int:
913     logger.debug("args[]='%s' - CALLED!", type(args))
914
915     logger.debug("args.domain='%s' - checking ...", args.domain)
916     if not validators.domain(args.domain):
917         logger.warning("args.domain='%s' is not valid.", args.domain)
918         return 100
919     elif blacklist.is_blacklisted(args.domain):
920         logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
921         return 101
922
923     logger.debug("Invoking locking.acquire() ...")
924     locking.acquire()
925
926     # Initial fetch
927     try:
928         logger.info("Fetching instances from args.domain='%s' ...", args.domain)
929         federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
930     except network.exceptions as exception:
931         logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
932         instances.set_last_error(args.domain, exception)
933         instances.update_data(args.domain)
934         return 100
935
936     if args.single:
937         logger.debug("Not fetching more instances - EXIT!")
938         return 0
939
940     # Loop through some instances
941     database.cursor.execute(
942         "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
943     )
944
945     rows = database.cursor.fetchall()
946     logger.info("Checking %d entries ...", len(rows))
947     for row in rows:
948         logger.debug("row[domain]='%s'", row["domain"])
949         if row["domain"] == "":
950             logger.debug("row[domain] is empty - SKIPPED!")
951             continue
952
953         logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
954         domain = row["domain"].encode("idna").decode("utf-8")
955         logger.debug("domain='%s' - AFTER!", domain)
956
957         if not domain_helper.is_wanted(domain):
958             logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain)
959             continue
960
961         try:
962             logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
963             federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
964         except network.exceptions as exception:
965             logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
966             instances.set_last_error(domain, exception)
967
968     logger.debug("Success - EXIT!")
969     return 0
970
971 def fetch_oliphant(args: argparse.Namespace) -> int:
972     logger.debug("args[]='%s' - CALLED!", type(args))
973
974     logger.debug("Invoking locking.acquire() ...")
975     locking.acquire()
976
977     source_domain = "codeberg.org"
978     if sources.is_recent(source_domain):
979         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
980         return 0
981     else:
982         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
983         sources.update(source_domain)
984
985     # Base URL
986     base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
987
988     # URLs to fetch
989     blocklists = (
990         {
991             "blocker": "artisan.chat",
992             "csv_url": "mastodon/artisan.chat.csv",
993         },{
994             "blocker": "mastodon.art",
995             "csv_url": "mastodon/mastodon.art.csv",
996         },{
997             "blocker": "pleroma.envs.net",
998             "csv_url": "mastodon/pleroma.envs.net.csv",
999         },{
1000             "blocker": "oliphant.social",
1001             "csv_url": "mastodon/_unified_tier3_blocklist.csv",
1002         },{
1003             "blocker": "mastodon.online",
1004             "csv_url": "mastodon/mastodon.online.csv",
1005         },{
1006             "blocker": "mastodon.social",
1007             "csv_url": "mastodon/mastodon.social.csv",
1008         },{
1009             "blocker": "mastodon.social",
1010             "csv_url": "other/missing-tier0-mastodon.social.csv",
1011         },{
1012             "blocker": "rage.love",
1013             "csv_url": "mastodon/rage.love.csv",
1014         },{
1015             "blocker": "sunny.garden",
1016             "csv_url": "mastodon/sunny.garden.csv",
1017         },{
1018             "blocker": "sunny.garden",
1019             "csv_url": "mastodon/gardenfence.csv",
1020         },{
1021             "blocker": "solarpunk.moe",
1022             "csv_url": "mastodon/solarpunk.moe.csv",
1023         },{
1024             "blocker": "toot.wales",
1025             "csv_url": "mastodon/toot.wales.csv",
1026         },{
1027             "blocker": "union.place",
1028             "csv_url": "mastodon/union.place.csv",
1029         },{
1030             "blocker": "oliphant.social",
1031             "csv_url": "mastodon/birdsite.csv",
1032         }
1033     )
1034
1035     domains = list()
1036
1037     logger.debug("Downloading %d files ...", len(blocklists))
1038     for block in blocklists:
1039         # Is domain given and not equal blocker?
1040         if isinstance(args.domain, str) and args.domain != block["blocker"]:
1041             logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1042             continue
1043         elif args.domain in domains:
1044             logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
1045             continue
1046
1047         instances.set_last_blocked(block["blocker"])
1048
1049         # Fetch this URL
1050         logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
1051         response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1052
1053         logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
1054         if not response.ok or response.status_code >= 300 or response.content == "":
1055             logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
1056             continue
1057
1058         logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
1059         reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1060
1061         blockdict = list()
1062
1063         cnt = 0
1064         for row in reader:
1065             logger.debug("row[%s]='%s'", type(row), row)
1066             domain = severity = None
1067             reject_media = reject_reports = False
1068
1069             if "#domain" in row:
1070                 domain = row["#domain"]
1071             elif "domain" in row:
1072                 domain = row["domain"]
1073             else:
1074                 logger.debug("row='%s' does not contain domain column", row)
1075                 continue
1076
1077             if "#severity" in row:
1078                 severity = blocks.alias_block_level(row["#severity"])
1079             elif "severity" in row:
1080                 severity = blocks.alias_block_level(row["severity"])
1081             else:
1082                 logger.debug("row='%s' does not contain severity column", row)
1083                 continue
1084
1085             if "#reject_media" in row and row["#reject_media"].lower() == "true":
1086                 reject_media = True
1087             elif "reject_media" in row and row["reject_media"].lower() == "true":
1088                 reject_media = True
1089
1090             if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1091                 reject_reports = True
1092             elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1093                 reject_reports = True
1094
1095             cnt = cnt + 1
1096             logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
1097             if domain == "":
1098                 logger.debug("domain is empty - SKIPPED!")
1099                 continue
1100             elif domain.endswith(".onion"):
1101                 logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
1102                 continue
1103             elif domain.endswith(".arpa"):
1104                 logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
1105                 continue
1106             elif domain.endswith(".tld"):
1107                 logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
1108                 continue
1109             elif domain.find("*") >= 0 or domain.find("?") >= 0:
1110                 logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
1111                 domain = utils.deobfuscate(domain, block["blocker"])
1112                 logger.debug("domain='%s' - AFTER!", domain)
1113
1114             if not validators.domain(domain):
1115                 logger.debug("domain='%s' is not a valid domain - SKIPPED!")
1116                 continue
1117             elif blacklist.is_blacklisted(domain):
1118                 logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
1119                 continue
1120             elif blocks.is_instance_blocked(block["blocker"], domain, severity):
1121                 logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
1122                 continue
1123
1124             logger.debug("Marking domain='%s' as handled", domain)
1125             domains.append(domain)
1126
1127             logger.debug("Processing domain='%s' ...", domain)
1128             processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1129             logger.debug("processed='%s'", processed)
1130
1131             if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
1132                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1133                 blockdict.append({
1134                     "blocked": domain,
1135                     "reason" : block["reason"],
1136                 })
1137
1138             if reject_media:
1139                 processing.block(block["blocker"], domain, None, "reject_media")
1140             if reject_reports:
1141                 processing.block(block["blocker"], domain, None, "reject_reports")
1142
1143         logger.debug("block[blocker]='%s'", block["blocker"])
1144         if block["blocker"] != "chaos.social":
1145             logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
1146             instances.set_total_blocks(block["blocker"], domains)
1147
1148         logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1149         if instances.has_pending(block["blocker"]):
1150             logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1151             instances.update_data(block["blocker"])
1152
1153         logger.debug("Invoking commit() ...")
1154         database.connection.commit()
1155
1156         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1157         if config.get("bot_enabled") and len(blockdict) > 0:
1158             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1159             network.send_bot_post(block["blocker"], blockdict)
1160
1161     logger.debug("Success! - EXIT!")
1162     return 0
1163
1164 def fetch_txt(args: argparse.Namespace) -> int:
1165     logger.debug("args[]='%s' - CALLED!", type(args))
1166
1167     logger.debug("Invoking locking.acquire() ...")
1168     locking.acquire()
1169
1170     # Static URLs
1171     urls = ({
1172         "blocker": "seirdy.one",
1173         "url"    : "https://seirdy.one/pb/bsl.txt",
1174     },)
1175
1176     logger.info("Checking %d text file(s) ...", len(urls))
1177     for row in urls:
1178         logger.debug("Fetching row[url]='%s' ...", row["url"])
1179         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1180
1181         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1182         if response.ok and response.status_code < 300 and response.text != "":
1183             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1184             domains = response.text.split("\n")
1185
1186             logger.info("Processing %d domains ...", len(domains))
1187             for domain in domains:
1188                 logger.debug("domain='%s' - BEFORE!", domain)
1189                 domain = tidyup.domain(domain)
1190
1191                 logger.debug("domain='%s' - AFTER!", domain)
1192                 if domain == "":
1193                     logger.debug("domain is empty - SKIPPED!")
1194                     continue
1195                 elif not domain_helper.is_wanted(domain):
1196                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1197                     continue
1198                 elif instances.is_recent(domain):
1199                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1200                     continue
1201
1202                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1203                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1204
1205                 logger.debug("processed='%s'", processed)
1206                 if not processed:
1207                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1208                     continue
1209
1210     logger.debug("Success! - EXIT!")
1211     return 0
1212
1213 def fetch_fedipact(args: argparse.Namespace) -> int:
1214     logger.debug("args[]='%s' - CALLED!", type(args))
1215
1216     logger.debug("Invoking locking.acquire() ...")
1217     locking.acquire()
1218
1219     source_domain = "fedipact.online"
1220     if sources.is_recent(source_domain):
1221         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1222         return 0
1223     else:
1224         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1225         sources.update(source_domain)
1226
1227     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1228     response = utils.fetch_url(
1229         f"https://{source_domain}",
1230         network.web_headers,
1231         (config.get("connection_timeout"), config.get("read_timeout"))
1232     )
1233
1234     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1235     if response.ok and response.status_code < 300 and response.text != "":
1236         logger.debug("Parsing %d Bytes ...", len(response.text))
1237
1238         doc = bs4.BeautifulSoup(response.text, "html.parser")
1239         logger.debug("doc[]='%s'", type(doc))
1240
1241         rows = doc.findAll("li")
1242         logger.info("Checking %d row(s) ...", len(rows))
1243         for row in rows:
1244             logger.debug("row[]='%s'", type(row))
1245             domain = tidyup.domain(row.contents[0])
1246
1247             logger.debug("domain='%s' - AFTER!", domain)
1248             if domain == "":
1249                 logger.debug("domain is empty - SKIPPED!")
1250                 continue
1251
1252             logger.debug("domain='%s' - BEFORE!", domain)
1253             domain = domain.encode("idna").decode("utf-8")
1254             logger.debug("domain='%s' - AFTER!", domain)
1255
1256             if not domain_helper.is_wanted(domain):
1257                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1258                 continue
1259             elif instances.is_registered(domain):
1260                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1261                 continue
1262             elif instances.is_recent(domain):
1263                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1264                 continue
1265
1266             logger.info("Fetching domain='%s' ...", domain)
1267             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1268
1269     logger.debug("Success! - EXIT!")
1270     return 0
1271
1272 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1273     logger.debug("args[]='%s' - CALLED!", type(args))
1274
1275     logger.debug("Invoking locking.acquire() ...")
1276     locking.acquire()
1277
1278     source_domain = "instances.joinmobilizon.org"
1279     if sources.is_recent(source_domain):
1280         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1281         return 0
1282     else:
1283         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1284         sources.update(source_domain)
1285
1286     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1287     raw = utils.fetch_url(
1288         f"https://{source_domain}/api/v1/instances",
1289         network.web_headers,
1290         (config.get("connection_timeout"), config.get("read_timeout"))
1291     ).text
1292     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1293
1294     parsed = json.loads(raw)
1295     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1296
1297     if "data" not in parsed:
1298         logger.warning("parsed()=%d does not contain key 'data'")
1299         return 1
1300
1301     logger.info("Checking %d instances ...", len(parsed["data"]))
1302     for row in parsed["data"]:
1303         logger.debug("row[]='%s'", type(row))
1304         if "host" not in row:
1305             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1306             continue
1307         elif not domain_helper.is_wanted(row["host"]):
1308             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1309             continue
1310         elif instances.is_registered(row["host"]):
1311             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1312             continue
1313
1314         logger.info("Fetching row[host]='%s' ...", row["host"])
1315         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1316
1317     logger.debug("Success! - EXIT!")
1318     return 0
1319
1320 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1321     logger.debug("args[]='%s' - CALLED!", type(args))
1322
1323     logger.debug("Invoking locking.acquire() ...")
1324     locking.acquire()
1325
1326     source_domain = "instanceapp.misskey.page"
1327     if sources.is_recent(source_domain):
1328         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1329         return 0
1330     else:
1331         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1332         sources.update(source_domain)
1333
1334     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1335     raw = utils.fetch_url(
1336         f"https://{source_domain}/instances.json",
1337         network.web_headers,
1338         (config.get("connection_timeout"), config.get("read_timeout"))
1339     ).text
1340     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1341
1342     parsed = json.loads(raw)
1343     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1344
1345     if "instancesInfos" not in parsed:
1346         logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1347         return 1
1348
1349     logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1350     for row in parsed["instancesInfos"]:
1351         logger.debug("row[%s]='%s'", type(row), row)
1352         if "url" not in row:
1353             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1354             continue
1355         elif not domain_helper.is_wanted(row["url"]):
1356             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1357             continue
1358         elif instances.is_registered(row["url"]):
1359             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1360             continue
1361
1362         logger.info("Fetching row[url]='%s' ...", row["url"])
1363         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1364
1365     logger.debug("Success! - EXIT!")
1366     return 0
1367
1368 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1369     logger.debug("args[]='%s' - CALLED!", type(args))
1370
1371     logger.debug("Invoking locking.acquire() ...")
1372     locking.acquire()
1373
1374     source_domain = "joinfediverse.wiki"
1375     if sources.is_recent(source_domain):
1376         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1377         return 0
1378     else:
1379         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1380         sources.update(source_domain)
1381
1382     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1383     raw = utils.fetch_url(
1384         f"https://{source_domain}/FediBlock",
1385         network.web_headers,
1386         (config.get("connection_timeout"), config.get("read_timeout"))
1387     ).text
1388     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1389
1390     doc = bs4.BeautifulSoup(raw, "html.parser")
1391     logger.debug("doc[]='%s'", type(doc))
1392
1393     tables = doc.findAll("table", {"class": "wikitable"})
1394
1395     logger.info("Analyzing %d table(s) ...", len(tables))
1396     blocklist = list()
1397     for table in tables:
1398         logger.debug("table[]='%s'", type(table))
1399
1400         rows = table.findAll("tr")
1401         logger.info("Checking %d row(s) ...", len(rows))
1402         block_headers = dict()
1403         for row in rows:
1404             logger.debug("row[%s]='%s'", type(row), row)
1405
1406             headers = row.findAll("th")
1407             logger.debug("Found headers()=%d header(s)", len(headers))
1408             if len(headers) > 1:
1409                 block_headers = dict()
1410                 cnt = 0
1411                 for header in headers:
1412                     cnt = cnt + 1
1413                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1414                     text = header.contents[0]
1415
1416                     logger.debug("text[]='%s'", type(text))
1417                     if not isinstance(text, str):
1418                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1419                         continue
1420                     elif validators.domain(text.strip()):
1421                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1422                         continue
1423
1424                     text = tidyup.domain(text.strip())
1425                     logger.debug("text='%s' - AFTER!", text)
1426                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1427                         logger.debug("Found header: '%s'=%d", text, cnt)
1428                         block_headers[cnt] = text
1429
1430             elif len(block_headers) == 0:
1431                 logger.debug("row is not scrapable - SKIPPED!")
1432                 continue
1433             elif len(block_headers) > 0:
1434                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1435                 cnt = 0
1436                 block = dict()
1437
1438                 for element in row.find_all(["th", "td"]):
1439                     cnt = cnt + 1
1440                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1441                     if cnt in block_headers:
1442                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1443
1444                         text = element.text.strip()
1445                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1446
1447                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1448                         if key in ["domain", "instance"]:
1449                             block[key] = text
1450                         elif key == "reason":
1451                             block[key] = tidyup.reason(text)
1452                         elif key == "subdomain(s)":
1453                             block[key] = list()
1454                             if text != "":
1455                                 block[key] = text.split("/")
1456                         else:
1457                             logger.debug("key='%s'", key)
1458                             block[key] = text
1459
1460                 logger.debug("block()=%d ...", len(block))
1461                 if len(block) > 0:
1462                     logger.debug("Appending block()=%d ...", len(block))
1463                     blocklist.append(block)
1464
1465     logger.debug("blocklist()=%d", len(blocklist))
1466
1467     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1468     domains = database.cursor.fetchall()
1469
1470     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1471     blocking = list()
1472     for block in blocklist:
1473         logger.debug("block='%s'", block)
1474         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1475             origin = block["blocked"]
1476             logger.debug("origin='%s'", origin)
1477             for subdomain in block["subdomain(s)"]:
1478                 block["blocked"] = subdomain + "." + origin
1479                 logger.debug("block[blocked]='%s'", block["blocked"])
1480                 blocking.append(block)
1481         else:
1482             blocking.append(block)
1483
1484     logger.debug("blocking()=%d", blocking)
1485     for block in blocking:
1486         logger.debug("block[]='%s'", type(block))
1487         if "blocked" not in block:
1488             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1489
1490         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1491         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1492
1493         if block["blocked"] == "":
1494             logger.debug("block[blocked] is empty - SKIPPED!")
1495             continue
1496         elif not domain_helper.is_wanted(block["blocked"]):
1497             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1498             continue
1499         elif instances.is_recent(block["blocked"]):
1500             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1501             continue
1502
1503         logger.debug("Proccessing blocked='%s' ...", block["blocked"])
1504         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1505
1506     blockdict = list()
1507     for blocker in domains:
1508         blocker = blocker[0]
1509         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1510         instances.set_last_blocked(blocker)
1511
1512         for block in blocking:
1513             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1514             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1515
1516             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1517             if block["blocked"] == "":
1518                 logger.debug("block[blocked] is empty - SKIPPED!")
1519                 continue
1520             elif not domain_helper.is_wanted(block["blocked"]):
1521                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1522                 continue
1523
1524             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1525             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1526                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
1527                 blockdict.append({
1528                     "blocked": block["blocked"],
1529                     "reason" : block["reason"],
1530                 })
1531
1532         if instances.has_pending(blocker):
1533             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1534             instances.update_data(blocker)
1535
1536         logger.debug("Invoking commit() ...")
1537         database.connection.commit()
1538
1539         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1540         if config.get("bot_enabled") and len(blockdict) > 0:
1541             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", blocker, len(blockdict))
1542             network.send_bot_post(blocker, blockdict)
1543
1544     logger.debug("Success! - EXIT!")
1545     return 0
1546
1547 def recheck_obfuscation(args: argparse.Namespace) -> int:
1548     logger.debug("args[]='%s' - CALLED!", type(args))
1549
1550     logger.debug("Invoking locking.acquire() ...")
1551     locking.acquire()
1552
1553     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1554         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1555     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1556         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1557     else:
1558         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1559
1560     rows = database.cursor.fetchall()
1561     logger.info("Checking %d domains ...", len(rows))
1562     for row in rows:
1563         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1564         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1565             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1566             continue
1567
1568         blocking = list()
1569         if row["software"] == "pleroma":
1570             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1571             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1572         elif row["software"] == "mastodon":
1573             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1574             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1575         elif row["software"] == "lemmy":
1576             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1577             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1578         elif row["software"] == "friendica":
1579             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1580             blocking = friendica.fetch_blocks(row["domain"])
1581         elif row["software"] == "misskey":
1582             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1583             blocking = misskey.fetch_blocks(row["domain"])
1584         else:
1585             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1586
1587         logger.debug("row[domain]='%s'", row["domain"])
1588         # chaos.social requires special care ...
1589         if row["domain"] != "chaos.social":
1590             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1591             instances.set_last_blocked(row["domain"])
1592             instances.set_total_blocks(row["domain"], blocking)
1593
1594         obfuscated = 0
1595         blockdict = list()
1596
1597         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1598         for block in blocking:
1599             logger.debug("block[blocked]='%s'", block["blocked"])
1600             blocked = None
1601
1602             if block["blocked"] == "":
1603                 logger.debug("block[blocked] is empty - SKIPPED!")
1604                 continue
1605             elif block["blocked"].endswith(".arpa"):
1606                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1607                 continue
1608             elif block["blocked"].endswith(".tld"):
1609                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1610                 continue
1611             elif block["blocked"].endswith(".onion"):
1612                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1613                 continue
1614             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1615                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1616                 obfuscated = obfuscated + 1
1617                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1618             elif not domain_helper.is_wanted(block["blocked"]):
1619                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1620                 continue
1621             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1622                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1623                 continue
1624
1625             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1626             if blocked is not None and blocked != block["blocked"]:
1627                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1628                 obfuscated = obfuscated - 1
1629
1630                 if blocks.is_instance_blocked(row["domain"], blocked):
1631                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1632                     continue
1633                 elif blacklist.is_blacklisted(blocked):
1634                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1635                     continue
1636
1637                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1638
1639                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1640                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1641                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1642                     blockdict.append({
1643                         "blocked": blocked,
1644                         "reason" : block["reason"],
1645                     })
1646
1647         logger.debug("Settings obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1648         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1649
1650         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1651         if obfuscated == 0 and len(blocking) > 0:
1652             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1653             instances.set_has_obfuscation(row["domain"], False)
1654
1655         if instances.has_pending(row["domain"]):
1656             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1657             instances.update_data(row["domain"])
1658
1659         logger.debug("Invoking commit() ...")
1660         database.connection.commit()
1661
1662         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1663         if config.get("bot_enabled") and len(blockdict) > 0:
1664             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1665             network.send_bot_post(row["domain"], blockdict)
1666
1667     logger.debug("Success! - EXIT!")
1668     return 0
1669
1670 def fetch_fedilist(args: argparse.Namespace) -> int:
1671     logger.debug("args[]='%s' - CALLED!", type(args))
1672
1673     logger.debug("Invoking locking.acquire() ...")
1674     locking.acquire()
1675
1676     source_domain = "demo.fedilist.com"
1677     if sources.is_recent(source_domain):
1678         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1679         return 0
1680     else:
1681         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1682         sources.update(source_domain)
1683
1684     url = f"http://{source_domain}/instance/csv?onion=not"
1685     if args.software is not None and args.software != "":
1686         logger.debug("args.software='%s'", args.software)
1687         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1688
1689     logger.info("Fetching url='%s' ...", url)
1690     response = reqto.get(
1691         url,
1692         headers=network.web_headers,
1693         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1694         allow_redirects=False
1695     )
1696
1697     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1698     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1699         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1700         return 1
1701
1702     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1703
1704     logger.debug("reader[]='%s'", type(reader))
1705     if reader is None:
1706         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1707         return 2
1708
1709     rows = list(reader)
1710
1711     logger.info("Checking %d rows ...", len(rows))
1712     for row in rows:
1713         logger.debug("row[]='%s'", type(row))
1714         if "hostname" not in row:
1715             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1716             continue
1717
1718         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1719         domain = tidyup.domain(row["hostname"])
1720         logger.debug("domain='%s' - AFTER!", domain)
1721
1722         if domain == "":
1723             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1724             continue
1725
1726         logger.debug("domain='%s' - BEFORE!", domain)
1727         domain = domain.encode("idna").decode("utf-8")
1728         logger.debug("domain='%s' - AFTER!", domain)
1729
1730         if not domain_helper.is_wanted(domain):
1731             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1732             continue
1733         elif (args.force is None or not args.force) and instances.is_registered(domain):
1734             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1735             continue
1736         elif instances.is_recent(domain):
1737             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1738             continue
1739
1740         logger.info("Fetching instances from domain='%s' ...", domain)
1741         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1742
1743     logger.debug("Success! - EXIT!")
1744     return 0
1745
1746 def update_nodeinfo(args: argparse.Namespace) -> int:
1747     logger.debug("args[]='%s' - CALLED!", type(args))
1748
1749     logger.debug("Invoking locking.acquire() ...")
1750     locking.acquire()
1751
1752     if args.domain is not None and args.domain != "":
1753         logger.debug("Fetching args.domain='%s'", args.domain)
1754         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1755     elif args.software is not None and args.software != "":
1756         logger.info("Fetching domains for args.software='%s'", args.software)
1757         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1758     else:
1759         logger.info("Fetching domains for recently updated ...")
1760         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1761
1762     domains = database.cursor.fetchall()
1763
1764     logger.info("Checking %d domain(s) ...", len(domains))
1765     cnt = 0
1766     for row in domains:
1767         logger.debug("row[]='%s'", type(row))
1768         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1769             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1770             continue
1771
1772         try:
1773             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1774             software = federation.determine_software(row["domain"])
1775
1776             logger.debug("Determined software='%s'", software)
1777             if (software != row["software"] and software is not None) or args.force is True:
1778                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1779                 instances.set_software(row["domain"], software)
1780
1781             if software is not None:
1782                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1783                 instances.set_success(row["domain"])
1784         except network.exceptions as exception:
1785             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1786             instances.set_last_error(row["domain"], exception)
1787
1788         instances.set_last_nodeinfo(row["domain"])
1789         instances.update_data(row["domain"])
1790         cnt = cnt + 1
1791
1792     logger.debug("Success! - EXIT!")
1793     return 0
1794
1795 def fetch_instances_social(args: argparse.Namespace) -> int:
1796     logger.debug("args[]='%s' - CALLED!", type(args))
1797
1798     logger.debug("Invoking locking.acquire() ...")
1799     locking.acquire()
1800
1801     source_domain = "instances.social"
1802
1803     if config.get("instances_social_api_key") == "":
1804         logger.error("API key not set. Please set in your config.json file.")
1805         return 1
1806     elif sources.is_recent(source_domain):
1807         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1808         return 0
1809     else:
1810         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1811         sources.update(source_domain)
1812
1813     headers = {
1814         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1815     }
1816
1817     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1818     fetched = network.get_json_api(
1819         source_domain,
1820         "/api/1.0/instances/list?count=0&sort_by=name",
1821         headers,
1822         (config.get("connection_timeout"), config.get("read_timeout"))
1823     )
1824     logger.debug("fetched[]='%s'", type(fetched))
1825
1826     if "error_message" in fetched:
1827         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1828         return 2
1829     elif "exception" in fetched:
1830         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1831         return 3
1832     elif "json" not in fetched:
1833         logger.warning("fetched has no element 'json' - EXIT!")
1834         return 4
1835     elif "instances" not in fetched["json"]:
1836         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1837         return 5
1838
1839     domains = list()
1840     rows = fetched["json"]["instances"]
1841
1842     logger.info("Checking %d row(s) ...", len(rows))
1843     for row in rows:
1844         logger.debug("row[]='%s'", type(row))
1845         domain = tidyup.domain(row["name"])
1846         logger.debug("domain='%s' - AFTER!", domain)
1847
1848         if domain == "":
1849             logger.debug("domain is empty - SKIPPED!")
1850             continue
1851
1852         logger.debug("domain='%s' - BEFORE!", domain)
1853         domain = domain.encode("idna").decode("utf-8")
1854         logger.debug("domain='%s' - AFTER!", domain)
1855
1856         if not domain_helper.is_wanted(domain):
1857             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1858             continue
1859         elif domain in domains:
1860             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1861             continue
1862         elif instances.is_registered(domain):
1863             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1864             continue
1865         elif instances.is_recent(domain):
1866             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1867             continue
1868
1869         logger.info("Fetching instances from domain='%s'", domain)
1870         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1871
1872     logger.debug("Success! - EXIT!")
1873     return 0
1874
1875 def fetch_relays(args: argparse.Namespace) -> int:
1876     logger.debug("args[]='%s' - CALLED!", type(args))
1877
1878     logger.debug("Invoking locking.acquire() ...")
1879     locking.acquire()
1880
1881     if args.domain is not None and args.domain != "":
1882         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1883     else:
1884         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1885
1886     domains = list()
1887     rows = database.cursor.fetchall()
1888
1889     logger.info("Checking %d relays ...", len(rows))
1890     for row in rows:
1891         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1892         peers = list()
1893         if not args.force and instances.is_recent(row["domain"]):
1894             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1895             continue
1896
1897         try:
1898             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1899             raw = utils.fetch_url(
1900                 f"https://{row['domain']}",
1901                 network.web_headers,
1902                 (config.get("connection_timeout"), config.get("read_timeout"))
1903             ).text
1904             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1905         except network.exceptions as exception:
1906             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1907             instances.set_last_error(row["domain"], exception)
1908             instances.set_last_instance_fetch(row["domain"])
1909             instances.update_data(row["domain"])
1910             continue
1911
1912         doc = bs4.BeautifulSoup(raw, features="html.parser")
1913         logger.debug("doc[]='%s'", type(doc))
1914
1915         logger.debug("row[software]='%s'", row["software"])
1916         if row["software"] == "activityrelay":
1917             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1918             tags = doc.findAll("p")
1919
1920             logger.debug("Checking %d paragraphs ...", len(tags))
1921             for tag in tags:
1922                 logger.debug("tag[]='%s'", type(tag))
1923                 if len(tag.contents) == 0:
1924                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1925                     continue
1926                 elif "registered instances" not in tag.contents[0]:
1927                     logger.debug("Skipping paragraph, text not found.")
1928                     continue
1929
1930                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1931                 for domain in tag.contents:
1932                     logger.debug("domain[%s]='%s'", type(domain), domain)
1933                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1934                         continue
1935
1936                     domain = str(domain)
1937                     logger.debug("domain='%s'", domain)
1938                     if not domain_helper.is_wanted(domain):
1939                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1940                         continue
1941
1942                     logger.debug("domain='%s' - BEFORE!", domain)
1943                     domain = tidyup.domain(domain)
1944                     logger.debug("domain='%s' - AFTER!", domain)
1945
1946                     if domain == "":
1947                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1948                         continue
1949                     elif domain not in peers:
1950                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1951                         peers.append(domain)
1952
1953                     if dict_helper.has_key(domains, "domain", domain):
1954                         logger.debug("domain='%s' already added", domain)
1955                         continue
1956
1957                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1958                     domains.append({
1959                         "domain": domain,
1960                         "origin": row["domain"],
1961                     })
1962         elif row["software"] in ["aoderelay", "selective-relay"]:
1963             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1964             if row["software"] == "aoderelay":
1965                 tags = doc.findAll("section", {"class": "instance"})
1966             else:
1967                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1968
1969             logger.debug("Checking %d tags ...", len(tags))
1970             for tag in tags:
1971                 logger.debug("tag[]='%s'", type(tag))
1972
1973                 link = tag.find("a")
1974                 logger.debug("link[%s]='%s'", type(link), link)
1975                 if link is None:
1976                     logger.warning("tag='%s' has no a-tag ...", tag)
1977                     continue
1978
1979                 components = urlparse(link["href"])
1980                 domain = components.netloc.lower()
1981
1982                 if not domain_helper.is_wanted(domain):
1983                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1984                     continue
1985
1986                 logger.debug("domain='%s' - BEFORE!", domain)
1987                 domain = tidyup.domain(domain)
1988                 logger.debug("domain='%s' - AFTER!", domain)
1989
1990                 if domain == "":
1991                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1992                     continue
1993                 elif domain not in peers:
1994                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1995                     peers.append(domain)
1996
1997                 if dict_helper.has_key(domains, "domain", domain):
1998                     logger.debug("domain='%s' already added", domain)
1999                     continue
2000
2001                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
2002                 domains.append({
2003                     "domain": domain,
2004                     "origin": row["domain"],
2005                 })
2006         else:
2007             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
2008
2009         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
2010         instances.set_last_instance_fetch(row["domain"])
2011
2012         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
2013         instances.set_total_peers(row["domain"], peers)
2014
2015         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
2016         instances.update_data(row["domain"])
2017
2018     logger.info("Checking %d domains ...", len(domains))
2019     for row in domains:
2020         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2021         if instances.is_registered(row["domain"]):
2022             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2023             continue
2024
2025         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2026         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2027
2028     logger.debug("Success! - EXIT!")
2029     return 0
2030
2031 def convert_idna(args: argparse.Namespace) -> int:
2032     logger.debug("args[]='%s' - CALLED!", type(args))
2033
2034     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2035     rows = database.cursor.fetchall()
2036
2037     logger.debug("rows[]='%s'", type(rows))
2038     instances.translate_idnas(rows, "domain")
2039
2040     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2041     rows = database.cursor.fetchall()
2042
2043     logger.debug("rows[]='%s'", type(rows))
2044     instances.translate_idnas(rows, "origin")
2045
2046     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2047     rows = database.cursor.fetchall()
2048
2049     logger.debug("rows[]='%s'", type(rows))
2050     blocks.translate_idnas(rows, "blocker")
2051
2052     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2053     rows = database.cursor.fetchall()
2054
2055     logger.debug("rows[]='%s'", type(rows))
2056     blocks.translate_idnas(rows, "blocked")
2057
2058     logger.debug("Success! - EXIT!")
2059     return 0