# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
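    """
    Checks whether args.domain is a syntactically valid domain name that is
    neither blacklisted nor already registered. Returns 0 when the domain is
    unknown, 100 when invalid, 101 when blacklisted and 102 when already
    registered.
    """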
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
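    """
    Reports all registered instances whose stored nodeinfo URL is neither
    relative nor contains the instance's own (possibly punycoded) domain
    name. Always returns 0.
    """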
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
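    """
    Fetches the public server list from the pixelfed.org API and passes
    every new, wanted domain to federation.fetch_instances(). Returns 0 on
    success, 1 when the API was accessed too recently and 100-103 on errors.
    """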
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers yourself here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
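    """
    Queries the GraphQL endpoint at gql.api.bka.li for a domain list and
    registers every new, wanted domain. Returns 0 on success, 1 when the
    API was accessed too recently and 100-102 on errors.
    """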
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error message='%s' - EXIT!", source_domain, fetched["error_message"])
            return 100
        elif "json" not in fetched:
            logger.warning("post_json_api() returned fetched[]='%s' with missing 'json' element - EXIT!", type(fetched))
            return 101
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s' - EXIT!", fetched["json"]["error"]["message"])
            return 102

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
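    """
    Fetches blocklists from registered instances, either for a single domain
    (args.domain), a single software type (args.software) or all supported
    instances. Deobfuscates censored entries where possible and records the
    blocks in the database.
    """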
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=NULL
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a Tor .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse DNS domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some servers obscure domains with question marks; unclear whether that depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Instance wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
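    """
    Crawls fediverse.observer, either for all software types found in its
    navigation menu or only for args.software, and registers every new,
    wanted domain from the returned node lists.
    """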
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = network.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif "data" not in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif "nodes" not in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))

        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if "domain" not in item:
                logger.debug("item()=%d has no element 'domain'", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
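    """
    Fetches the silenced/limited and suspended domain lists from
    wiki.todon.eu and records them as blocks of todon.eu at the matching
    block level.
    """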
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
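    """
    Parses chaos.social's federation.md from raw.githubusercontent.com and
    records the silenced and blocked instance tables as blocks of
    chaos.social.
    """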
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
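    """
    Fetches an FBA-specific RSS feed given via args.feed and registers
    every new, wanted domain found in the feed items.
    """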
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue


            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
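    """
    Fetches the ATOM feed of the FBA bot account (ryona.agency by default,
    args.feed overrides it) and registers every new, wanted domain linked
    from the feed entries.
    """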
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
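    """
    Fetches instance data, starting either from args.domain or from all
    instances running args.software, then re-crawls instances of supported
    software types whose last fetch is older than the configured
    'recheck_instance' interval.
    """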
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Init variables
    rows = list()

    # Is domain or software set?
    if args.domain not in [None, ""]:
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101

        logger.debug("args.domain='%s' - BEFORE!", args.domain)
        domain = tidyup.domain(args.domain)
        logger.debug("domain='%s' - AFTER!", domain)

        # Fetch record
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])
        rows = database.cursor.fetchall()
    elif args.software not in [None, ""]:
        logger.debug("args.software='%s' - BEFORE!", args.software)
        software = software_helper.alias(args.software)
        logger.debug("software='%s' - AFTER!", software)

        # Fetch records
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [software])
        rows = database.cursor.fetchall()

    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
        if row["software"] is None and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
            continue
        elif software_helper.is_relay(row["software"]) and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
            continue
        elif not args.force and args.software not in [None, ""] and instances.is_recent(row["domain"]):
            logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
            continue

        # Initial fetch
        try:
            logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)
            instances.update(row["domain"])
            continue

        if args.single:
            logger.debug("Not fetching more instances - BREAK!")
            break

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software \
FROM instances \
WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen', 'vebinet') \
AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) \
ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
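    """
    Processes all CSV-based blocklists configured in blocklists.csv_files,
    optionally limited to a single blocker via args.domain.
    """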
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
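    """
    Downloads oliphant's blocklist CSV files from codeberg.org and processes
    them, optionally limited to a single blocker via args.domain.
    """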
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
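    """
    Fetches the plain-text blocklists configured in blocklists.txt_files and
    processes every listed domain for the corresponding blocker.
    """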
1105     logger.debug("args[]='%s' - CALLED!", type(args))
1106
1107     logger.debug("Invoking locking.acquire() ...")
1108     locking.acquire()
1109
1110     logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
1111     for row in blocklists.txt_files:
1112         logger.debug("Fetching row[url]='%s' ...", row["url"])
1113         response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1114
1115         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1116         if response.ok and response.status_code == 200 and response.text != "":
1117             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1118             domains = response.text.strip().split("\n")
1119
1120             logger.info("Processing %d domains ...", len(domains))
1121             for domain in domains:
1122                 logger.debug("domain='%s' - BEFORE!", domain)
1123                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1124                 logger.debug("domain='%s' - AFTER!", domain)
1125
1126                 if domain in [None, ""]:
1127                     logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1128                     continue
1129                 elif not domain_helper.is_wanted(domain):
1130                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1131                     continue
1132                 elif not args.force and instances.is_registered(domain):
1133                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1134                     continue
1135
1136                 logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
1137                 processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
1138                 logger.debug("processed='%s'", processed)
1139
1140     logger.debug("Success! - EXIT!")
1141     return 0
1142
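# Editor's sketch (hypothetical): the text-file handling in fetch_txt()
# reduces to splitting the body on newlines and dropping entries that clean
# up to nothing. The lower()/strip() here is only a stand-in for the real
# tidyup.domain() helper.
def _sketch_parse_txt_blocklist(text: str) -> list:
    domains = []
    for line in text.strip().split("\n"):
        candidate = line.strip().lower()
        if candidate != "":
            domains.append(candidate)
    return domains
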
1143 def fetch_fedipact(args: argparse.Namespace) -> int:
1144     logger.debug("args[]='%s' - CALLED!", type(args))
1145
1146     logger.debug("Invoking locking.acquire() ...")
1147     locking.acquire()
1148
1149     source_domain = "fedipact.online"
1150     if sources.is_recent(source_domain):
1151         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1152         return 1
1153     else:
1154         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1155         sources.update(source_domain)
1156
1157     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1158     response = network.fetch_url(
1159         f"https://{source_domain}",
1160         network.web_headers,
1161         (config.get("connection_timeout"), config.get("read_timeout"))
1162     )
1163
1164     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1165     if response.ok and response.status_code == 200 and response.text != "":
1166         logger.debug("Parsing %d Bytes ...", len(response.text))
1167
1168         doc = bs4.BeautifulSoup(response.text, "html.parser")
1169         logger.debug("doc[]='%s'", type(doc))
1170
1171         rows = doc.findAll("li")
1172         logger.info("Checking %d row(s) ...", len(rows))
1173         for row in rows:
1174             logger.debug("row[]='%s'", type(row))
1175             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1176
1177             logger.debug("domain='%s' - AFTER!", domain)
1178             if domain in [None, ""]:
1179                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1180                 continue
1181
1182             logger.debug("domain='%s' - BEFORE!", domain)
1183             domain = domain.encode("idna").decode("utf-8")
1184             logger.debug("domain='%s' - AFTER!", domain)
1185
1186             if not domain_helper.is_wanted(domain):
1187                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1188                 continue
1189             elif instances.is_registered(domain):
1190                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1191                 continue
1192             elif instances.is_recent(domain):
1193                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1194                 continue
1195
1196             logger.info("Fetching domain='%s' ...", domain)
1197             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1198
1199     logger.debug("Success! - EXIT!")
1200     return 0
1201
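# Editor's sketch (hypothetical): fetch_fedipact() scrapes plain <li> items
# and punycode-encodes each candidate, as above. The helper below shows the
# same two steps on arbitrary HTML; bs4 is already imported at module level.
def _sketch_extract_li_domains(html: str) -> list:
    doc = bs4.BeautifulSoup(html, "html.parser")
    domains = []
    for item in doc.findAll("li"):
        text = item.get_text().strip().lower()
        if text != "":
            # Convert internationalized names to their "xn--" form.
            domains.append(text.encode("idna").decode("utf-8"))
    return domains
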
1202 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1203     logger.debug("args[]='%s' - CALLED!", type(args))
1204
1205     logger.debug("Invoking locking.acquire() ...")
1206     locking.acquire()
1207
1208     source_domain = "instances.joinmobilizon.org"
1209     if sources.is_recent(source_domain):
1210         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1211         return 1
1212     else:
1213         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1214         sources.update(source_domain)
1215
1216     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1217     raw = network.fetch_url(
1218         f"https://{source_domain}/api/v1/instances",
1219         network.web_headers,
1220         (config.get("connection_timeout"), config.get("read_timeout"))
1221     ).text
1222     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1223
1224     parsed = json.loads(raw)
1225     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1226
1227     if "data" not in parsed:
1228         logger.warning("parsed()=%d does not contain key 'data'")
1229         return 1
1230
1231     logger.info("Checking %d instances ...", len(parsed["data"]))
1232     for row in parsed["data"]:
1233         logger.debug("row[]='%s'", type(row))
1234         if "host" not in row:
1235             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1236             continue
1237         elif not domain_helper.is_wanted(row["host"]):
1238             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1239             continue
1240         elif instances.is_registered(row["host"]):
1241             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1242             continue
1243
1244         logger.info("Fetching row[host]='%s' ...", row["host"])
1245         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1246
1247     logger.debug("Success! - EXIT!")
1248     return 0
1249
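# Editor's sketch (hypothetical): the guard pattern used above for JSON
# APIs - parse, verify the expected top-level key, then keep only rows that
# carry the expected field. Key names match the joinmobilizon response.
def _sketch_iter_hosts(raw: str) -> list:
    parsed = json.loads(raw)
    if "data" not in parsed:
        return []
    return [row["host"] for row in parsed["data"] if "host" in row]
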
1250 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1251     logger.debug("args[]='%s' - CALLED!", type(args))
1252
1253     logger.debug("Invoking locking.acquire() ...")
1254     locking.acquire()
1255
1256     source_domain = "instanceapp.misskey.page"
1257     if sources.is_recent(source_domain):
1258         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1259         return 1
1260     else:
1261         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1262         sources.update(source_domain)
1263
1264     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1265     raw = network.fetch_url(
1266         f"https://{source_domain}/instances.json",
1267         network.web_headers,
1268         (config.get("connection_timeout"), config.get("read_timeout"))
1269     ).text
1270     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1271
1272     parsed = json.loads(raw)
1273     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1274
1275     if "instancesInfos" not in parsed:
1276         logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1277         return 1
1278
1279     logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1280     for row in parsed["instancesInfos"]:
1281         logger.debug("row[%s]='%s'", type(row), row)
1282         if "url" not in row:
1283             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1284             continue
1285         elif not domain_helper.is_wanted(row["url"]):
1286             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1287             continue
1288         elif instances.is_registered(row["url"]):
1289             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1290             continue
1291
1292         logger.info("Fetching row[url]='%s' ...", row["url"])
1293         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1294
1295     logger.debug("Success! - EXIT!")
1296     return 0
1297
1298 def recheck_obfuscation(args: argparse.Namespace) -> int:
1299     logger.debug("args[]='%s' - CALLED!", type(args))
1300
1301     logger.debug("Invoking locking.acquire() ...")
1302     locking.acquire()
1303
1304     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1305         logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1306         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1307     elif isinstance(args.software, str) and args.software != "":
1308         logger.debug("Fetching records for args.software='%s' ...", args.software)
1309         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1310     else:
1311         logger.debug("Fetching records where domains have obfuscated block entries ...")
1312         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1313
1314     rows = database.cursor.fetchall()
1315     logger.info("Checking %d domains ...", len(rows))
1316     for row in rows:
1317         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1318         if blacklist.is_blacklisted(row["domain"]):
1319             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1320             continue
1321         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1322             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1323             continue
1324
1325         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1326         blocking = federation.fetch_blocks(row["domain"])
1327
1328         logger.debug("blocking()=%d", len(blocking))
1329         if len(blocking) == 0:
1330             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1331             if row["software"] == "pleroma":
1332                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1333                 blocking = pleroma.fetch_blocks(row["domain"])
1334             elif row["software"] == "mastodon":
1335                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1336                 blocking = mastodon.fetch_blocks(row["domain"])
1337             elif row["software"] == "lemmy":
1338                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1339                 blocking = lemmy.fetch_blocks(row["domain"])
1340             elif row["software"] == "friendica":
1341                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1342                 blocking = friendica.fetch_blocks(row["domain"])
1343             elif row["software"] == "misskey":
1344                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1345                 blocking = misskey.fetch_blocks(row["domain"])
1346             else:
1347                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1348
1349         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1350         logger.debug("row[domain]='%s'", row["domain"])
1351         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1352             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1353             instances.set_last_blocked(row["domain"])
1354             instances.set_total_blocks(row["domain"], blocking)
1355
1356         obfuscated = 0
1357         blockdict = list()
1358
1359         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1360         for block in blocking:
1361             logger.debug("block[blocked]='%s'", block["blocked"])
1362             blocked = None
1363
1364             if block["blocked"] == "":
1365                 logger.debug("block[blocked] is empty - SKIPPED!")
1366                 continue
1367             elif block["blocked"].endswith(".onion"):
1368                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1369                 continue
1370             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1371                 logger.debug("blocked='%s' is an I2P onion domain name - SKIPPED!", block["blocked"])
1372                 continue
1373             elif block["blocked"].endswith(".arpa"):
1374                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1375                 continue
1376             elif block["blocked"].endswith(".tld"):
1377                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1378                 continue
1379             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1380                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1381                 obfuscated = obfuscated + 1
1382                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1383             elif not domain_helper.is_wanted(block["blocked"]):
1384                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1385                 continue
1386             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1387                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1388                 continue
1389
1390             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1391             if blocked is not None and blocked != block["blocked"]:
1392                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1393                 obfuscated = obfuscated - 1
1394
1395                 if blacklist.is_blacklisted(blocked):
1396                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1397                     continue
1398                 elif blacklist.is_blacklisted(row["domain"]):
1399                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1400                     continue
1401                 elif blocks.is_instance_blocked(row["domain"], blocked):
1402                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1403                     continue
1404
1405                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1406
1407                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1408                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1409                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1410                     blockdict.append({
1411                         "blocked": blocked,
1412                         "reason" : block["reason"],
1413                     })
1414
1415         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1416         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1417         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1418
1419         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1420         if instances.has_pending(row["domain"]):
1421             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1422             instances.update(row["domain"])
1423
1424         logger.debug("Invoking commit() ...")
1425         database.connection.commit()
1426
1427         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1428         if config.get("bot_enabled") and len(blockdict) > 0:
1429             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1430             network.send_bot_post(row["domain"], blockdict)
1431
1432     logger.debug("Success! - EXIT!")
1433     return 0
1434
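# Editor's sketch (hypothetical): one way a wildcard entry such as
# "exam***.social" can be resolved. Mastodon publishes a SHA-256 hex digest
# of the unmasked domain next to obfuscated entries (see block["digest"]
# above), so candidates can be hashed and compared. The helper name and the
# source of the candidate list are assumptions of this sketch, not the
# actual utils.deobfuscate() implementation.
import hashlib

def _sketch_deobfuscate(digest: str, candidates: list):
    for candidate in candidates:
        if hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest:
            return candidate
    return None
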
1435 def fetch_fedilist(args: argparse.Namespace) -> int:
1436     logger.debug("args[]='%s' - CALLED!", type(args))
1437
1438     logger.debug("Invoking locking.acquire() ...")
1439     locking.acquire()
1440
1441     source_domain = "demo.fedilist.com"
1442     if sources.is_recent(source_domain):
1443         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1444         return 1
1445     else:
1446         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1447         sources.update(source_domain)
1448
1449     url = f"http://{source_domain}/instance/csv?onion=not"
1450     if args.software is not None and args.software != "":
1451         logger.debug("args.software='%s'", args.software)
1452         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1453
1454     logger.info("Fetching url='%s' ...", url)
1455     response = reqto.get(
1456         url,
1457         headers=network.web_headers,
1458         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1459         allow_redirects=False
1460     )
1461
1462     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1463     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1464         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1465         return 1
1466
1467     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1468
1469     logger.debug("reader[]='%s'", type(reader))
1470     if reader is None:
1471         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1472         return 2
1473
1474     rows = list(reader)
1475
1476     logger.info("Checking %d rows ...", len(rows))
1477     for row in rows:
1478         logger.debug("row[]='%s'", type(row))
1479         if "hostname" not in row:
1480             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1481             continue
1482
1483         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1484         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1485         logger.debug("domain='%s' - AFTER!", domain)
1486
1487         if domain in [None, ""]:
1488             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1489             continue
1490
1491         logger.debug("domain='%s' - BEFORE!", domain)
1492         domain = domain.encode("idna").decode("utf-8")
1493         logger.debug("domain='%s' - AFTER!", domain)
1494
1495         if not domain_helper.is_wanted(domain):
1496             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1497             continue
1498         elif (args.force is None or not args.force) and instances.is_registered(domain):
1499             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1500             continue
1501         elif instances.is_recent(domain):
1502             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1503             continue
1504
1505         logger.info("Fetching instances from domain='%s' ...", domain)
1506         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1507
1508     logger.debug("Success! - EXIT!")
1509     return 0
1510
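# Editor's sketch (hypothetical): the CSV handling from fetch_fedilist() in
# isolation - csv.DictReader over the decoded body yields one dict per data
# line, keyed by the header row (fedilist uses a "hostname" column).
def _sketch_read_hostname_csv(content: bytes) -> list:
    reader = csv.DictReader(content.decode("utf-8").splitlines(), dialect="unix")
    return [row["hostname"] for row in reader if "hostname" in row]
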
1511 def update_nodeinfo(args: argparse.Namespace) -> int:
1512     logger.debug("args[]='%s' - CALLED!", type(args))
1513
1514     logger.debug("Invoking locking.acquire() ...")
1515     locking.acquire()
1516
1517     if args.domain is not None and args.domain != "":
1518         logger.debug("Fetching args.domain='%s'", args.domain)
1519         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1520     elif args.software is not None and args.software != "":
1521         logger.info("Fetching domains for args.software='%s'", args.software)
1522         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1523     elif args.mode is not None and args.mode != "":
1524         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1525         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1526     elif args.no_software:
1527         logger.info("Fetching domains with no software type detected ...")
1528         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1529     elif args.with_software:
1530         logger.info("Fetching domains with any software type detected ...")
1531         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1532     elif args.no_auto:
1533         logger.info("Fetching domains with other detection mode than AUTO_DISOVERY being set ...")
1534         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1535     elif args.no_detection:
1536         logger.info("Fetching domains with no detection mode being set ...")
1537         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1538     elif args.same:
1539         logger.info("Fetching domains with domain name and software being the same ...")
1540         database.cursor.execute("SELECT domain, software FROM instances WHERE domain=software ORDER BY last_updated ASC")
1541     else:
1542         logger.info("Fetching domains for recently updated ...")
1543         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1544
1545     domains = database.cursor.fetchall()
1546
1547     logger.info("Checking %d domain(s) ...", len(domains))
1548     cnt = 0
1549     for row in domains:
1550         logger.debug("row[]='%s'", type(row))
1551         if row["domain"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1552             logger.debug("row[domain]='%s' is an I2P address - SKIPPED", row["domain"])
1553             continue
1554         elif row["domain"].endswith(".onion"):
1555             logger.debug("row[domain]='%s' is a TOR .onion domain - SKIPPED", row["domain"])
1556             continue
1557         elif row["domain"].endswith(".arpa"):
1558             logger.debug("row[domain]='%s' is a reverse IP address - SKIPPED", row["domain"])
1559             continue
1560         elif row["domain"].endswith(".tld"):
1561             logger.debug("row[domain]='%s' is a fake domain - SKIPPED", row["domain"])
1562             continue
1563         elif blacklist.is_blacklisted(row["domain"]):
1564             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1565             continue
1566         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1567             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1568             continue
1569
1570         try:
1571             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1572             software = federation.determine_software(row["domain"])
1573
1574             logger.debug("Determined software='%s'", software)
1575             if (software != row["software"] and software is not None) or args.force is True:
1576                 logger.debug("software='%s'", software)
1577                 if software is None:
1578                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1579                     instances.set_nodeinfo_url(row["domain"], None)
1580
1581                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1582                 instances.set_software(row["domain"], software)
1583
1584             if software is not None:
1585                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1586                 instances.set_success(row["domain"])
1587         except network.exceptions as exception:
1588             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1589             instances.set_last_error(row["domain"], exception)
1590
1591         instances.set_last_nodeinfo(row["domain"])
1592         instances.update(row["domain"])
1593         cnt = cnt + 1
1594
1595     logger.debug("Success! - EXIT!")
1596     return 0
1597
1598 def fetch_instances_social(args: argparse.Namespace) -> int:
1599     logger.debug("args[]='%s' - CALLED!", type(args))
1600
1601     logger.debug("Invoking locking.acquire() ...")
1602     locking.acquire()
1603
1604     source_domain = "instances.social"
1605
1606     if config.get("instances_social_api_key") == "":
1607         logger.error("API key not set. Please set in your config.json file.")
1608         return 1
1609     elif sources.is_recent(source_domain):
1610         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1611         return 2
1612     else:
1613         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1614         sources.update(source_domain)
1615
1616     headers = {
1617         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1618     }
1619
1620     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1621     fetched = network.get_json_api(
1622         source_domain,
1623         "/api/1.0/instances/list?count=0&sort_by=name",
1624         headers=headers,
1625         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1626     )
1627     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1628
1629     if "error_message" in fetched:
1630         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1631         return 2
1632     elif "exception" in fetched:
1633         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1634         return 3
1635     elif "json" not in fetched:
1636         logger.warning("fetched has no element 'json' - EXIT!")
1637         return 4
1638     elif "instances" not in fetched["json"]:
1639         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1640         return 5
1641
1642     domains = list()
1643     rows = fetched["json"]["instances"]
1644
1645     logger.info("Checking %d row(s) ...", len(rows))
1646     for row in rows:
1647         logger.debug("row[]='%s'", type(row))
1648         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1649         logger.debug("domain='%s' - AFTER!", domain)
1650
1651         if domain in [None, ""]:
1652             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1653             continue
1654
1655         logger.debug("domain='%s' - BEFORE!", domain)
1656         domain = domain.encode("idna").decode("utf-8")
1657         logger.debug("domain='%s' - AFTER!", domain)
1658
1659         if not domain_helper.is_wanted(domain):
1660             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1661             continue
1662         elif domain in domains:
1663             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1664             continue
1665         elif instances.is_registered(domain):
1666             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1667             continue
1668         elif instances.is_recent(domain):
1669             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1670             continue
1671
1672         logger.info("Fetching instances from domain='%s' ...", domain)
1673         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1674
1675     logger.debug("Success! - EXIT!")
1676     return 0
1677
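# Editor's sketch (hypothetical): the Bearer-token header built above for
# the instances.social API. The key value is a placeholder; the real one
# comes from config.get("instances_social_api_key").
def _sketch_bearer_headers(api_key: str) -> dict:
    return {
        "Authorization": f"Bearer {api_key}",
    }
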
1678 def fetch_relaylist(args: argparse.Namespace) -> int:
1679     logger.debug("args[]='%s' - CALLED!", type(args))
1680
1681     logger.debug("Invoking locking.acquire() ...")
1682     locking.acquire()
1683
1684     source_domain = "api.relaylist.com"
1685
1686     if sources.is_recent(source_domain):
1687         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1688         return 1
1689     else:
1690         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1691         sources.update(source_domain)
1692
1693     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1694     fetched = network.get_json_api(
1695         source_domain,
1696         "/relays",
1697         {},
1698         (config.get("connection_timeout"), config.get("read_timeout"))
1699     )
1700     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1701
1702     if "error_message" in fetched:
1703         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1704         return 2
1705     elif "exception" in fetched:
1706         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1707         return 3
1708     elif "json" not in fetched:
1709         logger.warning("fetched has no element 'json' - EXIT!")
1710         return 4
1711
1712     domains = list()
1713
1714     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1715     for row in fetched["json"]:
1716         logger.debug("row[]='%s'", type(row))
1717         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1718         logger.debug("domain='%s' - AFTER!", domain)
1719
1720         if domain in [None, ""]:
1721             logger.debug("domain='%s' is empty after urlparse() - SKIPPED!", domain)
1722             continue
1723
1724         logger.debug("domain='%s' - BEFORE!", domain)
1725         domain = domain.encode("idna").decode("utf-8")
1726         logger.debug("domain='%s' - AFTER!", domain)
1727
1728         if not domain_helper.is_wanted(domain):
1729             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1730             continue
1731         elif domain in domains:
1732             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1733             continue
1734         elif instances.is_registered(domain):
1735             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1736             continue
1737         elif instances.is_recent(domain):
1738             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1739             continue
1740
1741         logger.info("Fetching instances from domain='%s'", domain)
1742         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1743
1744     logger.debug("Success! - EXIT!")
1745     return 0
1746
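# Editor's sketch (hypothetical): reducing a relay URL to a bare hostname
# as done above - urlparse() yields the netloc, lower-casing normalizes it
# and splitting on ":" drops any explicit port.
def _sketch_hostname_from_url(url: str) -> str:
    return urlparse(url).netloc.lower().split(":")[0]

# Example: _sketch_hostname_from_url("https://Relay.Example.com:443/inbox")
# -> "relay.example.com"
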
1747 def fetch_relays(args: argparse.Namespace) -> int:
1748     logger.debug("args[]='%s' - CALLED!", type(args))
1749
1750     logger.debug("Invoking locking.acquire() ...")
1751     locking.acquire()
1752
1753     if args.domain is not None and args.domain != "":
1754         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1755         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1756     elif args.software is not None and args.software != "":
1757         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1758         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1759     else:
1760         logger.debug("Fetch all relay instances ...")
1761         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1762
1763     domains = list()
1764     rows = database.cursor.fetchall()
1765
1766     logger.info("Checking %d relays ...", len(rows))
1767     for row in rows:
1768         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1769         if not args.force and instances.is_recent(row["domain"]):
1770             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1771             continue
1772         elif row["nodeinfo_url"] is None:
1773             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1774             continue
1775
1776         peers = list()
1777         try:
1778             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1779             if row["software"] == "pub-relay":
1780                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1781                 raw = network.fetch_api_url(
1782                     row["nodeinfo_url"],
1783                     (config.get("connection_timeout"), config.get("read_timeout"))
1784                 )
1785
1786                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1787                 if "exception" in raw:
1788                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1789                     raise raw["exception"]
1790                 elif "error_message" in raw:
1791                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1792                     instances.set_last_error(row["domain"], raw)
1793                     instances.set_last_instance_fetch(row["domain"])
1794                     instances.update(row["domain"])
1795                     continue
1796                 elif "json" not in raw:
1797                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1798                     continue
1799                 elif not "metadata" in raw["json"]:
1800                     logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
1801                     continue
1802                 elif not "peers" in raw["json"]["metadata"]:
1803                     logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
1804                     continue
1805             else:
1806                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1807                 raw = network.fetch_url(
1808                     f"https://{row['domain']}",
1809                     network.web_headers,
1810                     (config.get("connection_timeout"), config.get("read_timeout"))
1811                 ).text
1812                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1813
1814                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1815                 logger.debug("doc[]='%s'", type(doc))
1816
1817         except network.exceptions as exception:
1818             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1819             instances.set_last_error(row["domain"], exception)
1820             instances.set_last_instance_fetch(row["domain"])
1821             instances.update(row["domain"])
1822             continue
1823
1824         logger.debug("row[software]='%s'", row["software"])
1825         if row["software"] == "activityrelay":
1826             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1827             tags = doc.findAll("p")
1828
1829             logger.debug("Checking %d paragraphs ...", len(tags))
1830             for tag in tags:
1831                 logger.debug("tag[]='%s'", type(tag))
1832                 if len(tag.contents) == 0:
1833                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1834                     continue
1835                 elif "registered instances" not in tag.contents[0]:
1836                     logger.debug("Skipping paragraph, text not found.")
1837                     continue
1838
1839                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1840                 for domain in tag.contents:
1841                     logger.debug("domain[%s]='%s'", type(domain), domain)
1842                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1843                         continue
1844
1845                     domain = str(domain)
1846                     logger.debug("domain='%s'", domain)
1847                     if not domain_helper.is_wanted(domain):
1848                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1849                         continue
1850
1851                     logger.debug("domain='%s' - BEFORE!", domain)
1852                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1853                     logger.debug("domain='%s' - AFTER!", domain)
1854
1855                     if domain in [None, ""]:
1856                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1857                         continue
1858                     elif domain not in peers:
1859                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1860                         peers.append(domain)
1861
1862                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1863                     if dict_helper.has_key(domains, "domain", domain):
1864                         logger.debug("domain='%s' already added", domain)
1865                         continue
1866
1867                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1868                     domains.append({
1869                         "domain": domain,
1870                         "origin": row["domain"],
1871                     })
1872         elif row["software"] in ["aoderelay", "selective-relay"]:
1873             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1874             if row["software"] == "aoderelay":
1875                 tags = doc.findAll("section", {"class": "instance"})
1876             else:
1877                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1878
1879             logger.debug("Checking %d tags ...", len(tags))
1880             for tag in tags:
1881                 logger.debug("tag[]='%s'", type(tag))
1882
1883                 link = tag.find("a")
1884                 logger.debug("link[%s]='%s'", type(link), link)
1885                 if not isinstance(link, bs4.element.Tag):
1886                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1887                     continue
1888
1889                 components = urlparse(link.get("href"))
1890                 logger.debug("components(%d)='%s'", len(components), components)
1891                 domain = components.netloc.lower().split(":")[0]
1892
1893                 logger.debug("domain='%s' - BEFORE!", domain)
1894                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1895                 logger.debug("domain='%s' - AFTER!", domain)
1896
1897                 if domain in [None, ""]:
1898                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1899                     continue
1900                 elif domain not in peers:
1901                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1902                     peers.append(domain)
1903
1904                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1905                 if dict_helper.has_key(domains, "domain", domain):
1906                     logger.debug("domain='%s' already added", domain)
1907                     continue
1908
1909                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1910                 domains.append({
1911                     "domain": domain,
1912                     "origin": row["domain"],
1913                 })
1914         elif row["software"] == "pub-relay":
1915             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1916             for domain in raw["json"]["metadata"]["peers"]:
1917                 logger.debug("domain='%s' - BEFORE!", domain)
1918                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1919                 logger.debug("domain='%s' - AFTER!", domain)
1920
1921                 if domain in [None, ""]:
1922                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1923                     continue
1924                 elif domain not in peers:
1925                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1926                     peers.append(domain)
1927
1928                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1929                 if dict_helper.has_key(domains, "domain", domain):
1930                     logger.debug("domain='%s' already added", domain)
1931                     continue
1932
1933                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1934                 domains.append({
1935                     "domain": domain,
1936                     "origin": row["domain"],
1937                 })
1938         else:
1939             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1940             continue
1941
1942         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1943         instances.set_last_instance_fetch(row["domain"])
1944
1945         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1946         instances.set_total_peers(row["domain"], peers)
1947
1948         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1949         instances.update(row["domain"])
1950
1951     logger.info("Checking %d domains ...", len(domains))
1952     for row in domains:
1953         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1954         if not domain_helper.is_wanted(row["domain"]):
1955             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1956             continue
1957         elif instances.is_registered(row["domain"]):
1958             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1959             continue
1960
1961         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1962         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1963
1964     logger.debug("Success! - EXIT!")
1965     return 0
1966
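# Editor's sketch (hypothetical): what the dict_helper.has_key() calls in
# fetch_relays() presumably amount to - testing whether any dict in a list
# already carries a given value under a given key. The real helper's
# implementation may differ.
def _sketch_has_key(haystack: list, key: str, value) -> bool:
    return any(entry.get(key) == value for entry in haystack)
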
1967 def convert_idna(args: argparse.Namespace) -> int:
1968     logger.debug("args[]='%s' - CALLED!", type(args))
1969
1970     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1971     rows = database.cursor.fetchall()
1972
1973     logger.debug("rows[]='%s'", type(rows))
1974     instances.translate_idnas(rows, "domain")
1975
1976     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1977     rows = database.cursor.fetchall()
1978
1979     logger.debug("rows[]='%s'", type(rows))
1980     instances.translate_idnas(rows, "origin")
1981
1982     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1983     rows = database.cursor.fetchall()
1984
1985     logger.debug("rows[]='%s'", type(rows))
1986     blocks.translate_idnas(rows, "blocker")
1987
1988     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1989     rows = database.cursor.fetchall()
1990
1991     logger.debug("rows[]='%s'", type(rows))
1992     blocks.translate_idnas(rows, "blocked")
1993
1994     logger.debug("Success! - EXIT!")
1995     return 0
1996
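# Editor's sketch: the IDNA round-trip applied by the translate_idnas()
# calls above, shown on an invented internationalized name. Python's
# built-in "idna" codec produces the punycode ("xn--") form.
def _sketch_to_punycode(domain: str) -> str:
    return domain.encode("idna").decode("utf-8")

# Example: _sketch_to_punycode("münchen.example") -> "xn--mnchen-3ya.example"
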
1997 def remove_invalid(args: argparse.Namespace) -> int:
1998     logger.debug("args[]='%s' - CALLED!", type(args))
1999
2000     logger.debug("Invoking locking.acquire() ...")
2001     locking.acquire()
2002
2003     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2004     rows = database.cursor.fetchall()
2005
2006     logger.info("Checking %d domains ...", len(rows))
2007     for row in rows:
2008         logger.debug("row[domain]='%s'", row["domain"])
2009         if not validators.domain(row["domain"].split("/")[0]):
2010             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2011             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2012             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2013
2014     logger.debug("Invoking commit() ...")
2015     database.connection.commit()
2016
2017     logger.info("Vaccum cleaning database ...")
2018     database.cursor.execute("VACUUM")
2019
2020     logger.debug("Success! - EXIT!")
2021     return 0
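
# Editor's sketch (hypothetical): the validity test used by remove_invalid()
# above - validators.domain() returns a truthy value for a well-formed name
# and a falsy ValidationFailure otherwise; split("/")[0] strips any path
# fragment that slipped into the stored domain.
def _sketch_is_valid_domain(candidate: str) -> bool:
    return bool(validators.domain(candidate.split("/")[0]))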