# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
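
# Each function below implements one sub-command of the command-line
# front-end (referenced below as "./fba.py", e.g. "./fba.py fetch_cs") and
# returns a POSIX-style exit code: 0 on success, non-zero on failure.
# A typical crawl could look like this (argument names assumed from the
# args attributes used below):
#
#   ./fba.py fetch_instances --domain=example.com
#   ./fba.py fetch_blocks --domain=example.com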

def check_instance(args: argparse.Namespace) -> int:
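    """
    Checks whether args.domain could be added as a new instance.

    Returns 0 when the domain is unknown, 100 when it is not a valid
    domain name, 101 when it is blacklisted and 102 when it is already
    registered.
    """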
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
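    """
    Checks all stored nodeinfo URLs: warns when a nodeinfo_url matches
    neither the instance's domain nor its punycode form. Relative URLs
    always match. Always returns 0.
    """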
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt += 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
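    """
    Fetches the public server list from pixelfed.org's API and registers
    all new, wanted instances. Returns 0 on success, 100 when the CSRF
    check fails and 101-103 on API or network errors.
    """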
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token by default; network.source_headers doesn't need to be
    # added manually here.
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
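    """
    Fetches a domain list from the gql.api.bka.li GraphQL API and fetches
    instance data for each new, wanted domain. Returns 0 on success,
    100-102 on API or network errors.
    """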
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
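    """
    Fetches and records block lists from registered instances. By default
    only instances whose last_blocked timestamp is older than the
    configured recheck_block interval are re-checked; args.domain,
    args.software and args.force narrow or widen that selection.
    Returns 0 on success, 100-102 when args.domain fails validation.
    """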
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocking()=%d,nodeinfo_url='%s'", len(blocking), nodeinfo_url)
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s'", blocker, software)
            fetchers = {
                "pleroma"  : pleroma.fetch_blocks,
                "mastodon" : mastodon.fetch_blocks,
                "lemmy"    : lemmy.fetch_blocks,
                "friendica": friendica.fetch_blocks,
                "misskey"  : misskey.fetch_blocks,
            }
            if software in fetchers:
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = fetchers[software](blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", blocker, block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks; unclear whether this depends on the software version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
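    """
    Scrapes fediverse.observer for instances, either for each software
    type found in the site's navigation bar or only for args.software,
    and registers all new, wanted domains. Returns 0 on success, 1 when
    the navigation bar cannot be parsed.
    """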
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s'", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
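    """
    Fetches todon.eu's domain-block wiki page, splits the entries into
    "silenced" and "reject" levels and records them as blocks by
    todon.eu. Always returns 0.
    """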
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
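    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com,
    renders the Markdown, extracts the "silenced" and "blocked" tables
    and records the blocks by chaos.social. Always returns 0.
    """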
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
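    """
    Parses the FBA-specific RSS feed given as args.feed, extracts domains
    from the item links and registers all new, wanted instances. Returns
    0 on success, 100 on network errors.
    """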
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
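    """
    Parses the FBA bot's ATOM feed (ryona.agency by default, or args.feed
    when given), extracts domains linked in the entries and registers all
    new, wanted instances. Returns 0 on success, 100 on network errors.
    """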
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
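    """
    Fetches instance data for args.domain and, unless args.single is set,
    afterwards re-fetches all known instances whose last_instance_fetch
    timestamp is older than the configured recheck_instance interval.
    Returns 0 on success, 100/101 on validation or network errors.
    """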
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
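    """
    Downloads oliphant's CSV block lists from codeberg.org (optionally
    only the list whose blocker matches args.domain) and records the
    listed blocks, including reject_media/reject_reports levels. Always
    returns 0.
    """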
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code > 200 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt += 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.has(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
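    """
    Fetches static plain-text block lists (currently seirdy.one's
    bsl.txt), one domain per line, and processes each wanted domain.
    Always returns 0.
    """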
1134     logger.debug("args[]='%s' - CALLED!", type(args))
1135
1136     logger.debug("Invoking locking.acquire() ...")
1137     locking.acquire()
1138
1139     # Static URLs
1140     urls = ({
1141         "blocker": "seirdy.one",
1142         "url"    : "https://seirdy.one/pb/bsl.txt",
1143     },)
1144
1145     logger.info("Checking %d text file(s) ...", len(urls))
1146     for row in urls:
1147         logger.debug("Fetching row[url]='%s' ...", row["url"])
1148         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1149
1150         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1151         if response.ok and response.status_code == 200 and response.text != "":
1152             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1153             domains = response.text.split("\n")
1154
1155             logger.info("Processing %d domains ...", len(domains))
1156             for domain in domains:
1157                 logger.debug("domain='%s' - BEFORE!", domain)
1158                 domain = tidyup.domain(domain)
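                 # tidyup.domain() normalizes a scraped value before use; judging
                 # from its call sites here it strips whitespace and trailing dots
                 # and lower-cases, e.g. "Example.COM." -> "example.com" (illustrative).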
1159
1160                 logger.debug("domain='%s' - AFTER!", domain)
1161                 if domain == "":
1162                     logger.debug("domain is empty - SKIPPED!")
1163                     continue
1164                 elif not domain_helper.is_wanted(domain):
1165                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1166                     continue
1167                 elif instances.is_recent(domain):
1168                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1169                     continue
1170
1171                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1172                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1173
1174                 logger.debug("processed='%s'", processed)
1175                 if not processed:
1176                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1177                     continue
1178
1179     logger.debug("Success! - EXIT!")
1180     return 0
1181
1182 def fetch_fedipact(args: argparse.Namespace) -> int:
1183     logger.debug("args[]='%s' - CALLED!", type(args))
1184
1185     logger.debug("Invoking locking.acquire() ...")
1186     locking.acquire()
1187
1188     source_domain = "fedipact.online"
1189     if sources.is_recent(source_domain):
1190         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1191         return 0
1192     else:
1193         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1194         sources.update(source_domain)
1195
1196     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1197     response = utils.fetch_url(
1198         f"https://{source_domain}",
1199         network.web_headers,
1200         (config.get("connection_timeout"), config.get("read_timeout"))
1201     )
1202
1203     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1204     if response.ok and response.status_code == 200 and response.text != "":
1205         logger.debug("Parsing %d Bytes ...", len(response.text))
1206
1207         doc = bs4.BeautifulSoup(response.text, "html.parser")
1208         logger.debug("doc[]='%s'", type(doc))
1209
1210         rows = doc.findAll("li")
1211         logger.info("Checking %d row(s) ...", len(rows))
1212         for row in rows:
1213             logger.debug("row[]='%s'", type(row))
1214             domain = tidyup.domain(row.contents[0])
1215
1216             logger.debug("domain='%s' - AFTER!", domain)
1217             if domain == "":
1218                 logger.debug("domain is empty - SKIPPED!")
1219                 continue
1220
1221             logger.debug("domain='%s' - BEFORE!", domain)
1222             domain = domain.encode("idna").decode("utf-8")
1223             logger.debug("domain='%s' - AFTER!", domain)
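             # str.encode("idna") converts internationalized names to their
             # punycode ("xn--") form, e.g. "bücher.example" becomes
             # "xn--bcher-kva.example"; plain-ASCII names pass through unchanged.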
1224
1225             if not domain_helper.is_wanted(domain):
1226                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1227                 continue
1228             elif instances.is_registered(domain):
1229                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1230                 continue
1231             elif instances.is_recent(domain):
1232                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1233                 continue
1234
1235             logger.info("Fetching domain='%s' ...", domain)
1236             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1237
1238     logger.debug("Success! - EXIT!")
1239     return 0
1240
1241 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1242     logger.debug("args[]='%s' - CALLED!", type(args))
1243
1244     logger.debug("Invoking locking.acquire() ...")
1245     locking.acquire()
1246
1247     source_domain = "instances.joinmobilizon.org"
1248     if sources.is_recent(source_domain):
1249         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1250         return 0
1251     else:
1252         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1253         sources.update(source_domain)
1254
1255     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1256     raw = utils.fetch_url(
1257         f"https://{source_domain}/api/v1/instances",
1258         network.web_headers,
1259         (config.get("connection_timeout"), config.get("read_timeout"))
1260     ).text
1261     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1262
1263     parsed = json.loads(raw)
1264     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
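     # The joinmobilizon API response is assumed to be an envelope roughly
     # shaped like this (illustrative sketch; only "data"/"host" are relied upon):
     #
     #     {"total": 123, "data": [{"host": "mobilizon.example", ...}, ...]}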
1265
1266     if "data" not in parsed:
1267         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1268         return 1
1269
1270     logger.info("Checking %d instances ...", len(parsed["data"]))
1271     for row in parsed["data"]:
1272         logger.debug("row[]='%s'", type(row))
1273         if "host" not in row:
1274             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1275             continue
1276         elif not domain_helper.is_wanted(row["host"]):
1277             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1278             continue
1279         elif instances.is_registered(row["host"]):
1280             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1281             continue
1282
1283         logger.info("Fetching row[host]='%s' ...", row["host"])
1284         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1285
1286     logger.debug("Success! - EXIT!")
1287     return 0
1288
1289 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1290     logger.debug("args[]='%s' - CALLED!", type(args))
1291
1292     logger.debug("Invoking locking.acquire() ...")
1293     locking.acquire()
1294
1295     source_domain = "instanceapp.misskey.page"
1296     if sources.is_recent(source_domain):
1297         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1298         return 0
1299     else:
1300         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1301         sources.update(source_domain)
1302
1303     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1304     raw = utils.fetch_url(
1305         f"https://{source_domain}/instances.json",
1306         network.web_headers,
1307         (config.get("connection_timeout"), config.get("read_timeout"))
1308     ).text
1309     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1310
1311     parsed = json.loads(raw)
1312     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
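     # instances.json is assumed to look roughly like this (illustrative
     # sketch; only "instancesInfos"/"url" are relied upon):
     #
     #     {"date": "...", "instancesInfos": [{"url": "misskey.example", ...}, ...]}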
1313
1314     if "instancesInfos" not in parsed:
1315         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1316         return 1
1317
1318     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1319     for row in parsed["instancesInfos"]:
1320         logger.debug("row[%s]='%s'", type(row), row)
1321         if "url" not in row:
1322             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1323             continue
1324         elif not domain_helper.is_wanted(row["url"]):
1325             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1326             continue
1327         elif instances.is_registered(row["url"]):
1328             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1329             continue
1330
1331         logger.info("Fetching row[url]='%s' ...", row["url"])
1332         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1333
1334     logger.debug("Success! - EXIT!")
1335     return 0
1336
1337 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1338     logger.debug("args[]='%s' - CALLED!", type(args))
1339
1340     logger.debug("Invoking locking.acquire() ...")
1341     locking.acquire()
1342
1343     source_domain = "joinfediverse.wiki"
1344     if sources.is_recent(source_domain):
1345         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1346         return 0
1347     else:
1348         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1349         sources.update(source_domain)
1350
1351     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1352     raw = utils.fetch_url(
1353         f"https://{source_domain}/FediBlock",
1354         network.web_headers,
1355         (config.get("connection_timeout"), config.get("read_timeout"))
1356     ).text
1357     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1358
1359     doc = bs4.BeautifulSoup(raw, "html.parser")
1360     logger.debug("doc[]='%s'", type(doc))
1361
1362     tables = doc.findAll("table", {"class": "wikitable"})
1363
1364     logger.info("Analyzing %d table(s) ...", len(tables))
1365     blocklist = list()
1366     for table in tables:
1367         logger.debug("table[]='%s'", type(table))
1368
1369         rows = table.findAll("tr")
1370         logger.info("Checking %d row(s) ...", len(rows))
1371         block_headers = dict()
1372         for row in rows:
1373             logger.debug("row[%s]='%s'", type(row), row)
1374
1375             headers = row.findAll("th")
1376             logger.debug("Found headers()=%d header(s)", len(headers))
1377             if len(headers) > 1:
1378                 block_headers = dict()
1379                 cnt = 0
1380                 for header in headers:
1381                     cnt = cnt + 1
1382                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1383                     text = header.contents[0]
1384
1385                     logger.debug("text[]='%s'", type(text))
1386                     if not isinstance(text, str):
1387                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1388                         continue
1389                     elif validators.domain(text.strip()):
1390                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1391                         continue
1392
1393                     text = tidyup.domain(text.strip())
1394                     logger.debug("text='%s' - AFTER!", text)
1395                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1396                         logger.debug("Found header: '%s'=%d", text, cnt)
1397                         block_headers[cnt] = text
1398
1399             elif len(block_headers) == 0:
1400                 logger.debug("row is not scrapable - SKIPPED!")
1401                 continue
1402             elif len(block_headers) > 0:
1403                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1404                 cnt = 0
1405                 block = dict()
1406
1407                 for element in row.find_all(["th", "td"]):
1408                     cnt = cnt + 1
1409                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1410                     if cnt in block_headers:
1411                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1412
1413                         text = element.text.strip()
1414                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1415
1416                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1417                         if key == "blocked":
1418                             block[key] = text
1419                         elif key == "reason":
1420                             block[key] = tidyup.reason(text)
1421                         elif key == "subdomain(s)":
1422                             block[key] = list()
1423                             if text != "":
1424                                 block[key] = text.split("/")
1425                         else:
1426                             logger.debug("key='%s'", key)
1427                             block[key] = text
1428
1429                 logger.debug("block()=%d ...", len(block))
1430                 if len(block) > 0:
1431                     logger.debug("Appending block()=%d ...", len(block))
1432                     blocklist.append(block)
1433
1434     logger.debug("blocklist()=%d", len(blocklist))
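     # At this point each blocklist entry is a dict scraped from one wiki
     # table row, roughly (illustrative):
     #
     #     {"blocked": "bad.example", "subdomain(s)": ["mail", "www"],
     #      "block reason(s)": "spam"}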
1435
1436     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1437     domains = database.cursor.fetchall()
1438
1439     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1440     blocking = list()
1441     for block in blocklist:
1442         logger.debug("block='%s'", block)
1443         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1444             origin = block["blocked"]
1445             logger.debug("origin='%s'", origin)
1446             for subdomain in block["subdomain(s)"]:
1447                 entry = {**block, "blocked": subdomain + "." + origin}  # copy per subdomain; appending the shared dict would leave every element aliasing the last name
1448                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1449                 blocking.append(entry)
1450         else:
1451             blocking.append(block)
1452
1453     logger.debug("blocking()=%d", len(blocking))
1454     for block in blocking:
1455         logger.debug("block[]='%s'", type(block))
1456         if "blocked" not in block:
1457             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1458
1459         block["blocked"] = tidyup.domain(block["blocked"])
1460         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1461
1462         if block["blocked"] == "":
1463             logger.debug("block[blocked] is empty - SKIPPED!")
1464             continue
1465
1466         # Encode to punycode only after the empty check ("".encode("idna") raises UnicodeError).
1467         block["blocked"] = block["blocked"].encode("idna").decode("utf-8")
1468         if not domain_helper.is_wanted(block["blocked"]):
1469             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1470             continue
1471         elif instances.is_recent(block["blocked"]):
1472             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1473             continue
1471
1472         logger.debug("Processing blocked='%s' ...", block["blocked"])
1473         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1474
1475     blockdict = list()
1476     for blocker in domains:
1477         blocker = blocker[0]
1478         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1479         instances.set_last_blocked(blocker)
1480
1481         for block in blocking:
1482             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1483             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1484
1485             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1486             if block["blocked"] == "":
1487                 logger.debug("block[blocked] is empty - SKIPPED!")
1488                 continue
1489             elif not domain_helper.is_wanted(block["blocked"]):
1490                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1491                 continue
1492
1493             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1494             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1495                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1496                 blockdict.append({
1497                     "blocked": block["blocked"],
1498                     "reason" : block["reason"],
1499                 })
1500
1501         if instances.has_pending(blocker):
1502             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1503             instances.update(blocker)
1504
1505         logger.debug("Invoking commit() ...")
1506         database.connection.commit()
1507
1508         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1509         if config.get("bot_enabled") and len(blockdict) > 0:
1510             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1511             network.send_bot_post(blocker, blockdict)
1512
1513     logger.debug("Success! - EXIT!")
1514     return 0
1515
1516 def recheck_obfuscation(args: argparse.Namespace) -> int:
1517     logger.debug("args[]='%s' - CALLED!", type(args))
1518
1519     logger.debug("Invoking locking.acquire() ...")
1520     locking.acquire()
1521
1522     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1523         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1524     elif isinstance(args.software, str) and args.software != "":  # software names are not domains, so no validators.domain() comparison here
1525         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1526     else:
1527         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1528
1529     rows = database.cursor.fetchall()
1530     logger.info("Checking %d domains ...", len(rows))
1531     for row in rows:
1532         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1533         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1534             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1535             continue
1536
1537         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1538         blocking = federation.fetch_blocks(row["domain"])
1539
1540         logger.debug("blocking()=%d", len(blocking))
1541         if len(blocking) == 0:
1542             if row["software"] == "pleroma":
1543                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1544                 blocking = pleroma.fetch_blocks(row["domain"])
1545             elif row["software"] == "mastodon":
1546                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1547                 blocking = mastodon.fetch_blocks(row["domain"])
1548             elif row["software"] == "lemmy":
1549                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1550                 blocking = lemmy.fetch_blocks(row["domain"])
1551             elif row["software"] == "friendica":
1552                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1553                 blocking = friendica.fetch_blocks(row["domain"])
1554             elif row["software"] == "misskey":
1555                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1556                 blocking = misskey.fetch_blocks(row["domain"])
1557             else:
1558                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1559
1560         # c.s isn't part of oliphant's "hidden" blocklists
1561         logger.debug("row[domain]='%s'", row["domain"])
1562         if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]):
1563             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1564             instances.set_last_blocked(row["domain"])
1565             instances.set_total_blocks(row["domain"], blocking)
1566
1567         obfuscated = 0
1568         blockdict = list()
1569
1570         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1571         for block in blocking:
1572             logger.debug("block[blocked]='%s'", block["blocked"])
1573             blocked = None
1574
1575             if block["blocked"] == "":
1576                 logger.debug("block[blocked] is empty - SKIPPED!")
1577                 continue
1578             elif block["blocked"].endswith(".arpa"):
1579                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1580                 continue
1581             elif block["blocked"].endswith(".tld"):
1582                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1583                 continue
1584             elif block["blocked"].endswith(".onion"):
1585                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1586                 continue
1587             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1588                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1589                 obfuscated = obfuscated + 1
1590                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
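                 # utils.deobfuscate() makes a best-effort attempt to recover the
                 # real name behind a wildcard like "*.example.com", optionally
                 # guided by the block's hash; it may return None, which is why
                 # the result is re-checked below before being added.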
1591             elif not domain_helper.is_wanted(block["blocked"]):
1592                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1593                 continue
1594             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1595                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1596                 continue
1597
1598             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1599             if blocked is not None and blocked != block["blocked"]:
1600                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1601                 obfuscated = obfuscated - 1
1602
1603                 if blocks.is_instance_blocked(row["domain"], blocked):
1604                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1605                     continue
1606                 elif blacklist.is_blacklisted(blocked):
1607                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1608                     continue
1609
1610                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1611
1612                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1613                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1614                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1615                     blockdict.append({
1616                         "blocked": blocked,
1617                         "reason" : block["reason"],
1618                     })
1619
1620         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1621         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1622
1623         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1624         if obfuscated == 0 and len(blocking) > 0:
1625             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1626             instances.set_has_obfuscation(row["domain"], False)
1627
1628         if instances.has_pending(row["domain"]):
1629             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1630             instances.update(row["domain"])
1631
1632         logger.debug("Invoking commit() ...")
1633         database.connection.commit()
1634
1635         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1636         if config.get("bot_enabled") and len(blockdict) > 0:
1637             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1638             network.send_bot_post(row["domain"], blockdict)
1639
1640     logger.debug("Success! - EXIT!")
1641     return 0
1642
1643 def fetch_fedilist(args: argparse.Namespace) -> int:
1644     logger.debug("args[]='%s' - CALLED!", type(args))
1645
1646     logger.debug("Invoking locking.acquire() ...")
1647     locking.acquire()
1648
1649     source_domain = "demo.fedilist.com"
1650     if sources.is_recent(source_domain):
1651         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1652         return 0
1653     else:
1654         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1655         sources.update(source_domain)
1656
1657     url = f"http://{source_domain}/instance/csv?onion=not"
1658     if args.software is not None and args.software != "":
1659         logger.debug("args.software='%s'", args.software)
1660         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1661
1662     logger.info("Fetching url='%s' ...", url)
1663     response = reqto.get(
1664         url,
1665         headers=network.web_headers,
1666         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1667         allow_redirects=False
1668     )
1669
1670     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1671     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1672         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1673         return 1
1674
1675     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
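     # The CSV export is assumed to carry one instance per row with at least
     # a "hostname" column, e.g. (illustrative):
     #
     #     hostname,software,...
     #     pleroma.example,pleroma,...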
1676
1677     logger.debug("reader[]='%s'", type(reader))
1678     if reader is None:
1679         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1680         return 2
1681
1682     rows = list(reader)
1683
1684     logger.info("Checking %d rows ...", len(rows))
1685     for row in rows:
1686         logger.debug("row[]='%s'", type(row))
1687         if "hostname" not in row:
1688             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1689             continue
1690
1691         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1692         domain = tidyup.domain(row["hostname"])
1693         logger.debug("domain='%s' - AFTER!", domain)
1694
1695         if domain == "":
1696             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1697             continue
1698
1699         logger.debug("domain='%s' - BEFORE!", domain)
1700         domain = domain.encode("idna").decode("utf-8")
1701         logger.debug("domain='%s' - AFTER!", domain)
1702
1703         if not domain_helper.is_wanted(domain):
1704             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1705             continue
1706         elif (args.force is None or not args.force) and instances.is_registered(domain):
1707             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1708             continue
1709         elif instances.is_recent(domain):
1710             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1711             continue
1712
1713         logger.info("Fetching instances from domain='%s' ...", domain)
1714         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1715
1716     logger.debug("Success! - EXIT!")
1717     return 0
1718
1719 def update_nodeinfo(args: argparse.Namespace) -> int:
1720     logger.debug("args[]='%s' - CALLED!", type(args))
1721
1722     logger.debug("Invoking locking.acquire() ...")
1723     locking.acquire()
1724
1725     if args.domain is not None and args.domain != "":
1726         logger.debug("Fetching args.domain='%s'", args.domain)
1727         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1728     elif args.software is not None and args.software != "":
1729         logger.info("Fetching domains for args.software='%s'", args.software)
1730         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1731     elif args.mode is not None and args.mode != "":
1732         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1733         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1734     elif args.no_software:
1735         logger.info("Fetching domains with no software type detected ...")
1736         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
1737     else:
1738         logger.info("Fetching domains with stale nodeinfo ...")
1739         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
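         # recheck_nodeinfo is a maximum age in seconds: with e.g. 604800
         # (7 days, illustrative), only rows whose last_nodeinfo timestamp is
         # older than a week, or missing entirely, are re-examined.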
1740
1741     domains = database.cursor.fetchall()
1742
1743     logger.info("Checking %d domain(s) ...", len(domains))
1744     cnt = 0
1745     for row in domains:
1746         logger.debug("row[]='%s'", type(row))
1747         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1748             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1749             continue
1750
1751         try:
1752             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1753             software = federation.determine_software(row["domain"])
1754
1755             logger.debug("Determined software='%s'", software)
1756             if (software != row["software"] and software is not None) or args.force is True:
1757                 logger.debug("software='%s'", software)
1758                 if software is None:
1759                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1760                     instances.set_nodeinfo_url(row["domain"], None)
1761
1762                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1763                 instances.set_software(row["domain"], software)
1764
1765             if software is not None:
1766                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1767                 instances.set_success(row["domain"])
1768         except network.exceptions as exception:
1769             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1770             instances.set_last_error(row["domain"], exception)
1771
1772         instances.set_last_nodeinfo(row["domain"])
1773         instances.update(row["domain"])
1774         cnt = cnt + 1
1775
1776     logger.debug("Success! - EXIT!")
1777     return 0
1778
1779 def fetch_instances_social(args: argparse.Namespace) -> int:
1780     logger.debug("args[]='%s' - CALLED!", type(args))
1781
1782     logger.debug("Invoking locking.acquire() ...")
1783     locking.acquire()
1784
1785     source_domain = "instances.social"
1786
1787     if config.get("instances_social_api_key") == "":
1788         logger.error("API key not set. Please set in your config.json file.")
1789         return 1
1790     elif sources.is_recent(source_domain):
1791         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1792         return 0
1793     else:
1794         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1795         sources.update(source_domain)
1796
1797     headers = {
1798         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1799     }
1800
1801     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1802     fetched = network.get_json_api(
1803         source_domain,
1804         "/api/1.0/instances/list?count=0&sort_by=name",
1805         headers,
1806         (config.get("connection_timeout"), config.get("read_timeout"))
1807     )
1808     logger.debug("fetched[]='%s'", type(fetched))
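     # network.get_json_api() returns a dict carrying either an
     # "error_message"/"exception" entry on failure or the parsed payload
     # under "json"; for this endpoint the payload is assumed to look like:
     #
     #     {"instances": [{"name": "mastodon.example", ...}, ...]}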
1809
1810     if "error_message" in fetched:
1811         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1812         return 2
1813     elif "exception" in fetched:
1814         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1815         return 3
1816     elif "json" not in fetched:
1817         logger.warning("fetched has no element 'json' - EXIT!")
1818         return 4
1819     elif "instances" not in fetched["json"]:
1820         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1821         return 5
1822
1823     domains = list()
1824     rows = fetched["json"]["instances"]
1825
1826     logger.info("Checking %d row(s) ...", len(rows))
1827     for row in rows:
1828         logger.debug("row[]='%s'", type(row))
1829         domain = tidyup.domain(row["name"])
1830         logger.debug("domain='%s' - AFTER!", domain)
1831
1832         if domain == "":
1833             logger.debug("domain is empty - SKIPPED!")
1834             continue
1835
1836         logger.debug("domain='%s' - BEFORE!", domain)
1837         domain = domain.encode("idna").decode("utf-8")
1838         logger.debug("domain='%s' - AFTER!", domain)
1839
1840         if not domain_helper.is_wanted(domain):
1841             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1842             continue
1843         elif domain in domains:
1844             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1845             continue
1846         elif instances.is_registered(domain):
1847             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1848             continue
1849         elif instances.is_recent(domain):
1850             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1851             continue
1852
1853         logger.info("Fetching instances from domain='%s' ...", domain)
1854         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1855
1856     logger.debug("Success! - EXIT!")
1857     return 0
1858
1859 def fetch_relays(args: argparse.Namespace) -> int:
1860     logger.debug("args[]='%s' - CALLED!", type(args))
1861
1862     logger.debug("Invoking locking.acquire() ...")
1863     locking.acquire()
1864
1865     if args.domain is not None and args.domain != "":
1866         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1867     else:
1868         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1869
1870     domains = list()
1871     rows = database.cursor.fetchall()
1872
1873     logger.info("Checking %d relays ...", len(rows))
1874     for row in rows:
1875         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1876         peers = list()
1877         if not args.force and instances.is_recent(row["domain"]):
1878             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1879             continue
1880
1881         try:
1882             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1883             raw = utils.fetch_url(
1884                 f"https://{row['domain']}",
1885                 network.web_headers,
1886                 (config.get("connection_timeout"), config.get("read_timeout"))
1887             ).text
1888             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1889         except network.exceptions as exception:
1890             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1891             instances.set_last_error(row["domain"], exception)
1892             instances.set_last_instance_fetch(row["domain"])
1893             instances.update(row["domain"])
1894             continue
1895
1896         doc = bs4.BeautifulSoup(raw, features="html.parser")
1897         logger.debug("doc[]='%s'", type(doc))
1898
1899         logger.debug("row[software]='%s'", row["software"])
1900         if row["software"] == "activityrelay":
1901             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1902             tags = doc.findAll("p")
1903
1904             logger.debug("Checking %d paragraphs ...", len(tags))
1905             for tag in tags:
1906                 logger.debug("tag[]='%s'", type(tag))
1907                 if len(tag.contents) == 0:
1908                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1909                     continue
1910                 elif "registered instances" not in tag.contents[0]:
1911                     logger.debug("Skipping paragraph, text not found.")
1912                     continue
1913
1914                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1915                 for domain in tag.contents:
1916                     logger.debug("domain[%s]='%s'", type(domain), domain)
1917                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1918                         continue
1919
1920                     domain = str(domain)
1921                     logger.debug("domain='%s'", domain)
1922                     if not domain_helper.is_wanted(domain):
1923                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1924                         continue
1925
1926                     logger.debug("domain='%s' - BEFORE!", domain)
1927                     domain = tidyup.domain(domain)
1928                     logger.debug("domain='%s' - AFTER!", domain)
1929
1930                     if domain == "":
1931                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1932                         continue
1933                     elif domain not in peers:
1934                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1935                         peers.append(domain)
1936
1937                     if dict_helper.has_key(domains, "domain", domain):
1938                         logger.debug("domain='%s' already added", domain)
1939                         continue
1940
1941                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1942                     domains.append({
1943                         "domain": domain,
1944                         "origin": row["domain"],
1945                     })
1946         elif row["software"] in ["aoderelay", "selective-relay"]:
1947             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1948             if row["software"] == "aoderelay":
1949                 tags = doc.findAll("section", {"class": "instance"})
1950             else:
1951                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1952
1953             logger.debug("Checking %d tags ...", len(tags))
1954             for tag in tags:
1955                 logger.debug("tag[]='%s'", type(tag))
1956
1957                 link = tag.find("a")
1958                 logger.debug("link[%s]='%s'", type(link), link)
1959                 if link is None:
1960                     logger.warning("tag='%s' has no a-tag ...", tag)
1961                     continue
1962
1963                 components = urlparse(link["href"])
1964                 domain = components.netloc.lower()
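                 # urlparse("https://relay.example/inbox").netloc yields
                 # "relay.example"; lower-casing matches how domains are stored
                 # elsewhere in the database.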
1965
1966                 if not domain_helper.is_wanted(domain):
1967                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1968                     continue
1969
1970                 logger.debug("domain='%s' - BEFORE!", domain)
1971                 domain = tidyup.domain(domain)
1972                 logger.debug("domain='%s' - AFTER!", domain)
1973
1974                 if domain == "":
1975                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1976                     continue
1977                 elif domain not in peers:
1978                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1979                     peers.append(domain)
1980
1981                 if dict_helper.has_key(domains, "domain", domain):
1982                     logger.debug("domain='%s' already added", domain)
1983                     continue
1984
1985                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1986                 domains.append({
1987                     "domain": domain,
1988                     "origin": row["domain"],
1989                 })
1990         else:
1991             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1992
1993         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1994         instances.set_last_instance_fetch(row["domain"])
1995
1996         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1997         instances.set_total_peers(row["domain"], peers)
1998
1999         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
2000         instances.update(row["domain"])
2001
2002     logger.info("Checking %d domains ...", len(domains))
2003     for row in domains:
2004         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2005         if instances.is_registered(row["domain"]):
2006             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2007             continue
2008
2009         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2010         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2011
2012     logger.debug("Success! - EXIT!")
2013     return 0
2014
2015 def convert_idna(args: argparse.Namespace) -> int:
2016     logger.debug("args[]='%s' - CALLED!", type(args))
2017
2018     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2019     rows = database.cursor.fetchall()
2020
2021     logger.debug("rows[]='%s'", type(rows))
2022     instances.translate_idnas(rows, "domain")
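     # Rows still holding Unicode names (no "xn--" marker) are presumably
     # rewritten to punycode here, e.g. "bücher.example" ->
     # "xn--bcher-kva.example"; the same runs below for origin, blocker and blocked.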
2023
2024     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2025     rows = database.cursor.fetchall()
2026
2027     logger.debug("rows[]='%s'", type(rows))
2028     instances.translate_idnas(rows, "origin")
2029
2030     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2031     rows = database.cursor.fetchall()
2032
2033     logger.debug("rows[]='%s'", type(rows))
2034     blocks.translate_idnas(rows, "blocker")
2035
2036     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2037     rows = database.cursor.fetchall()
2038
2039     logger.debug("rows[]='%s'", type(rows))
2040     blocks.translate_idnas(rows, "blocked")
2041
2042     logger.debug("Success! - EXIT!")
2043     return 0
2044
2045 def remove_invalid(args: argparse.Namespace) -> int:
2046     logger.debug("args[]='%s' - CALLED!", type(args))
2047
2048     logger.debug("Invoking locking.acquire() ...")
2049     locking.acquire()
2050
2051     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2052     rows = database.cursor.fetchall()
2053
2054     logger.info("Checking %d domains ...", len(rows))
2055     for row in rows:
2056         logger.debug("row[domain]='%s'", row["domain"])
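         # Some rows carry a path fragment (e.g. "host.example/users/foo",
         # illustrative); split("/")[0] isolates the host part for validation.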
2057         if not validators.domain(row["domain"].split("/")[0]):
2058             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2059             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2060             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2061
2062     logger.debug("Invoking commit() ...")
2063     database.connection.commit()
2064
2065     logger.info("Vacuum-cleaning database ...")
2066     database.cursor.execute("VACUUM")
2067
2068     logger.debug("Success! - EXIT!")
2069     return 0