# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import argparse
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
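    """
    Checks a single domain: returns 100 if args.domain is syntactically
    invalid, 101 if it is blacklisted, 102 if it is already registered and
    0 if it is not yet known.
    """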
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
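    """
    Sanity-checks stored nodeinfo URLs: warns about (and counts) every
    instance whose absolute nodeinfo_url contains neither its domain nor the
    punycode form of it. Relative URLs always match and are skipped.
    """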
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
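    """
    Fetches the public server list from pixelfed.org's API and registers every
    new, wanted and not recently crawled domain. Returns 0 on success, 1 if
    the API has been queried too recently, otherwise an error code >= 100.
    """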
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default; there is no need to add network.source_headers manually here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
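    """
    Queries the GraphQL endpoint at gql.api.bka.li for its domain list and
    fetches instance data for every new, wanted domain found there.
    """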
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows(data)()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
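    """
    Fetches and records blocklists from registered instances. A single
    instance (args.domain) or software type (args.software) can be selected;
    args.only_none limits the run to instances without any counted blocks.
    Obfuscated entries are deobfuscated where possible, and new 'reject'
    blocks may be announced through the bot when it is enabled.
    """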
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=NULL
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", blocker, block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks; it is unclear whether that depends on the software version
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
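    """
    Scrapes fediverse.observer: determines the software types from the site's
    navigation bar (or takes args.software) and registers every listed, new
    and wanted domain.
    """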
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain[%s]='%s'", type(domain), domain)
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
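    """
    Fetches the silenced/suspended server lists from wiki.todon.eu and records
    them as blocks of todon.eu.
    """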
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
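    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com,
    parses the silenced and blocked tables from the rendered Markdown and
    records them as blocks of chaos.social.
    """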
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
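    """
    Parses the FBA-specific RSS feed given via args.feed and registers every
    new, wanted domain found in the item links.
    """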
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
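    """
    Parses the FBA bot's ATOM feed (ryona.agency by default, overridable via
    args.feed) and registers every new, wanted domain linked from its entries.
    """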
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
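    """
    Fetches instances from args.domain and then, unless args.single is set,
    from all known instances of supported software types whose peer lists are
    due for a re-fetch.
    """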
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    logger.debug("software='%s'", software)
    if software is None:
        logger.warning("args.domain='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated.", args.domain, args.domain)
        return 102
    elif software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 103

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 104

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
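    """
    Processes all CSV-based blocklists configured in blocklists.csv_files,
    optionally restricted to a single blocker via args.domain.
    """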
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given that does not match this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
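    """
    Downloads oliphant's CSV blocklists from codeberg.org and processes them,
    optionally restricted to a single blocker via args.domain.
    """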
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given that does not match this blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
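    """
    Fetches the plain-text blocklists configured in blocklists.txt_files and
    processes every wanted domain listed in them.
    """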
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None
                logger.debug("domain='%s' - AFTER!", domain)

                if domain in [None, ""]:
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif not args.force and instances.is_registered(domain):
                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
                logger.debug("processed='%s'", processed)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
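    """
    Scrapes the participant list from fedipact.online and registers every new,
    wanted domain.
    """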
1104     logger.debug("args[]='%s' - CALLED!", type(args))
1105
1106     logger.debug("Invoking locking.acquire() ...")
1107     locking.acquire()
1108
1109     source_domain = "fedipact.online"
1110     if sources.is_recent(source_domain):
1111         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1112         return 1
1113     else:
1114         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1115         sources.update(source_domain)
1116
1117     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1118     response = utils.fetch_url(
1119         f"https://{source_domain}",
1120         network.web_headers,
1121         (config.get("connection_timeout"), config.get("read_timeout"))
1122     )
1123
1124     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1125     if response.ok and response.status_code == 200 and response.text != "":
1126         logger.debug("Parsing %d Bytes ...", len(response.text))
1127
1128         doc = bs4.BeautifulSoup(response.text, "html.parser")
1129         logger.debug("doc[]='%s'", type(doc))
1130
1131         rows = doc.findAll("li")
1132         logger.info("Checking %d row(s) ...", len(rows))
1133         for row in rows:
1134             logger.debug("row[]='%s'", type(row))
1135             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1136
1137             logger.debug("domain='%s' - AFTER!", domain)
1138             if domain in [None, ""]:
1139                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1140                 continue
1141
1142             logger.debug("domain='%s' - BEFORE!", domain)
1143             domain = domain.encode("idna").decode("utf-8")
1144             logger.debug("domain='%s' - AFTER!", domain)
1145
1146             if not domain_helper.is_wanted(domain):
1147                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1148                 continue
1149             elif instances.is_registered(domain):
1150                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1151                 continue
1152             elif instances.is_recent(domain):
1153                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1154                 continue
1155
1156             logger.info("Fetching domain='%s' ...", domain)
1157             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1158
1159     logger.debug("Success! - EXIT!")
1160     return 0
1161
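# Fetches the public Mobilizon instance directory from the
# instances.joinmobilizon.org API and registers every unknown, wanted domain,
# passing "demo.mobilizon.org" as the origin.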
1162 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1163     logger.debug("args[]='%s' - CALLED!", type(args))
1164
1165     logger.debug("Invoking locking.acquire() ...")
1166     locking.acquire()
1167
1168     source_domain = "instances.joinmobilizon.org"
1169     if sources.is_recent(source_domain):
1170         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1171         return 1
1172     else:
1173         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1174         sources.update(source_domain)
1175
1176     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1177     raw = utils.fetch_url(
1178         f"https://{source_domain}/api/v1/instances",
1179         network.web_headers,
1180         (config.get("connection_timeout"), config.get("read_timeout"))
1181     ).text
1182     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1183
1184     parsed = json.loads(raw)
1185     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1186
1187     if "data" not in parsed:
1188         logger.warning("parsed()=%d does not contain key 'data' - EXIT!", len(parsed))
1189         return 1
1190
1191     logger.info("Checking %d instances ...", len(parsed["data"]))
1192     for row in parsed["data"]:
1193         logger.debug("row[]='%s'", type(row))
1194         if "host" not in row:
1195             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1196             continue
1197         elif not domain_helper.is_wanted(row["host"]):
1198             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1199             continue
1200         elif instances.is_registered(row["host"]):
1201             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1202             continue
1203
1204         logger.info("Fetching row[host]='%s' ...", row["host"])
1205         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1206
1207     logger.debug("Success! - EXIT!")
1208     return 0
1209
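# Fetches the community-maintained Misskey instance list (instances.json on
# instanceapp.misskey.page) and registers every unknown, wanted domain,
# passing "misskey.io" as the origin.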
1210 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1211     logger.debug("args[]='%s' - CALLED!", type(args))
1212
1213     logger.debug("Invoking locking.acquire() ...")
1214     locking.acquire()
1215
1216     source_domain = "instanceapp.misskey.page"
1217     if sources.is_recent(source_domain):
1218         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1219         return 1
1220     else:
1221         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1222         sources.update(source_domain)
1223
1224     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1225     raw = utils.fetch_url(
1226         f"https://{source_domain}/instances.json",
1227         network.web_headers,
1228         (config.get("connection_timeout"), config.get("read_timeout"))
1229     ).text
1230     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1231
1232     parsed = json.loads(raw)
1233     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1234
1235     if "instancesInfos" not in parsed:
1236         logger.warning("parsed()=%d does not contain element 'instancesInfos' - EXIT!", len(parsed))
1237         return 1
1238
1239     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1240     for row in parsed["instancesInfos"]:
1241         logger.debug("row[%s]='%s'", type(row), row)
1242         if "url" not in row:
1243             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1244             continue
1245         elif not domain_helper.is_wanted(row["url"]):
1246             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1247             continue
1248         elif instances.is_registered(row["url"]):
1249             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1250             continue
1251
1252         logger.info("Fetching row[url]='%s' ...", row["url"])
1253         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1254
1255     logger.debug("Success! - EXIT!")
1256     return 0
1257
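# Re-fetches block lists from instances flagged for obfuscation (or not yet
# checked) and tries to map each wildcarded entry back to a real domain via
# utils.deobfuscate(). Resolved blocks are recorded and, for "reject" entries
# with bot_enabled set, announced via a bot post. Hypothetical invocation,
# assuming the usual command dispatcher:
#   ./fba.py recheck_obfuscation --domain=example.com --force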
1258 def recheck_obfuscation(args: argparse.Namespace) -> int:
1259     logger.debug("args[]='%s' - CALLED!", type(args))
1260
1261     logger.debug("Invoking locking.acquire() ...")
1262     locking.acquire()
1263
1264     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1265         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1266     elif isinstance(args.software, str) and args.software != "":
1267         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1268     else:
1269         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1270
1271     rows = database.cursor.fetchall()
1272     logger.info("Checking %d domains ...", len(rows))
1273     for row in rows:
1274         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1275         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1276             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1277             continue
1278         elif blacklist.is_blacklisted(row["domain"]):
1279             logger.warning("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1280             continue
1281
1282         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1283         blocking = federation.fetch_blocks(row["domain"])
1284
1285         logger.debug("blocking()=%d", len(blocking))
1286         if len(blocking) == 0:
1287             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1288             if row["software"] == "pleroma":
1289                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1290                 blocking = pleroma.fetch_blocks(row["domain"])
1291             elif row["software"] == "mastodon":
1292                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1293                 blocking = mastodon.fetch_blocks(row["domain"])
1294             elif row["software"] == "lemmy":
1295                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1296                 blocking = lemmy.fetch_blocks(row["domain"])
1297             elif row["software"] == "friendica":
1298                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1299                 blocking = friendica.fetch_blocks(row["domain"])
1300             elif row["software"] == "misskey":
1301                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1302                 blocking = misskey.fetch_blocks(row["domain"])
1303             else:
1304                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1305
1306         # chaos.social ("c.s") isn't part of oliphant's "hidden" blocklists
1307         logger.debug("row[domain]='%s'", row["domain"])
1308         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1309             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1310             instances.set_last_blocked(row["domain"])
1311             instances.set_total_blocks(row["domain"], blocking)
1312
1313         obfuscated = 0
1314         blockdict = list()
1315
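        # Counting logic: each wildcard entry increments "obfuscated" and each
        # successful deobfuscation decrements it again, so the final value is
        # the number of entries that could not be resolved.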
1316         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1317         for block in blocking:
1318             logger.debug("block[blocked]='%s'", block["blocked"])
1319             blocked = None
1320
1321             if block["blocked"] == "":
1322                 logger.debug("block[blocked] is empty - SKIPPED!")
1323                 continue
1324             elif block["blocked"].endswith(".onion"):
1325                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1326                 continue
1327             elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
1328                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1329                 continue
1330             elif block["blocked"].endswith(".arpa"):
1331                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1332                 continue
1333             elif block["blocked"].endswith(".tld"):
1334                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1335                 continue
1336             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1337                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1338                 obfuscated = obfuscated + 1
1339                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1340             elif not domain_helper.is_wanted(block["blocked"]):
1341                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1342                 continue
1343             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1344                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1345                 continue
1346
1347             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1348             if blocked is not None and blocked != block["blocked"]:
1349                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1350                 obfuscated = obfuscated - 1
1351
1352                 if blacklist.is_blacklisted(blocked):
1353                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1354                     continue
1355                 elif blacklist.is_blacklisted(row["domain"]):
1356                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1357                     continue
1358                 elif blocks.is_instance_blocked(row["domain"], blocked):
1359                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1360                     continue
1361
1362                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1363
1364                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1365                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1366                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1367                     blockdict.append({
1368                         "blocked": blocked,
1369                         "reason" : block["reason"],
1370                     })
1371
1372         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1373         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1374         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1375
1376         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1377         if instances.has_pending(row["domain"]):
1378             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1379             instances.update(row["domain"])
1380
1381         logger.debug("Invoking commit() ...")
1382         database.connection.commit()
1383
1384         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1385         if config.get("bot_enabled") and len(blockdict) > 0:
1386             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1387             network.send_bot_post(row["domain"], blockdict)
1388
1389     logger.debug("Success! - EXIT!")
1390     return 0
1391
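# Downloads a CSV dump of instances from demo.fedilist.com, optionally
# filtered by software type, and registers every unknown, wanted domain.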
1392 def fetch_fedilist(args: argparse.Namespace) -> int:
1393     logger.debug("args[]='%s' - CALLED!", type(args))
1394
1395     logger.debug("Invoking locking.acquire() ...")
1396     locking.acquire()
1397
1398     source_domain = "demo.fedilist.com"
1399     if sources.is_recent(source_domain):
1400         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1401         return 1
1402     else:
1403         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1404         sources.update(source_domain)
1405
1406     url = f"http://{source_domain}/instance/csv?onion=not"
1407     if args.software is not None and args.software != "":
1408         logger.debug("args.software='%s'", args.software)
1409         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1410
1411     logger.info("Fetching url='%s' ...", url)
1412     response = reqto.get(
1413         url,
1414         headers=network.web_headers,
1415         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1416         allow_redirects=False
1417     )
1418
1419     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1420     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1421         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1422         return 1
1423
1424     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1425
1426     logger.debug("reader[]='%s'", type(reader))
1427     if reader is None:
1428         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1429         return 2
1430
1431     rows = list(reader)
1432
1433     logger.info("Checking %d rows ...", len(rows))
1434     for row in rows:
1435         logger.debug("row[]='%s'", type(row))
1436         if "hostname" not in row:
1437             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1438             continue
1439
1440         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1441         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1442         logger.debug("domain='%s' - AFTER!", domain)
1443
1444         if domain in [None, ""]:
1445             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1446             continue
1447
1448         logger.debug("domain='%s' - BEFORE!", domain)
1449         domain = domain.encode("idna").decode("utf-8")
1450         logger.debug("domain='%s' - AFTER!", domain)
1451
1452         if not domain_helper.is_wanted(domain):
1453             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1454             continue
1455         elif (args.force is None or not args.force) and instances.is_registered(domain):
1456             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s' - SKIPPED!", domain, type(args.force))
1457             continue
1458         elif instances.is_recent(domain):
1459             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1460             continue
1461
1462         logger.info("Fetching instances from domain='%s' ...", domain)
1463         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1464
1465     logger.debug("Success! - EXIT!")
1466     return 0
1467
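# Re-runs software detection (nodeinfo) for stored instances. The result set
# can be narrowed via args.domain, args.software, args.mode, args.no_software,
# args.with_software, args.no_auto or args.no_detection; otherwise all
# instances are checked, least recently updated first.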
1468 def update_nodeinfo(args: argparse.Namespace) -> int:
1469     logger.debug("args[]='%s' - CALLED!", type(args))
1470
1471     logger.debug("Invoking locking.acquire() ...")
1472     locking.acquire()
1473
1474     if args.domain is not None and args.domain != "":
1475         logger.debug("Fetching args.domain='%s'", args.domain)
1476         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1477     elif args.software is not None and args.software != "":
1478         logger.info("Fetching domains for args.software='%s'", args.software)
1479         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1480     elif args.mode is not None and args.mode != "":
1481         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1482         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1483     elif args.no_software:
1484         logger.info("Fetching domains with no software type detected ...")
1485         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1486     elif args.with_software:
1487         logger.info("Fetching domains with any software type detected ...")
1488         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1489     elif args.no_auto:
1490         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1491         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1492     elif args.no_detection:
1493         logger.info("Fetching domains with no detection mode being set ...")
1494         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1495     else:
1496         logger.info("Fetching all domains, least recently updated first ...")
1497         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1498
1499     domains = database.cursor.fetchall()
1500
1501     logger.info("Checking %d domain(s) ...", len(domains))
1502     cnt = 0
1503     for row in domains:
1504         logger.debug("row[]='%s'", type(row))
1505         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1506             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1507             continue
1508         elif blacklist.is_blacklisted(row["domain"]):
1509             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1510             continue
1511
1512         try:
1513             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1514             software = federation.determine_software(row["domain"])
1515
1516             logger.debug("Determined software='%s'", software)
1517             if (software != row["software"] and software is not None) or args.force is True:
1518                 logger.debug("software='%s'", software)
1519                 if software is None:
1520                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1521                     instances.set_nodeinfo_url(row["domain"], None)
1522
1523                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1524                 instances.set_software(row["domain"], software)
1525
1526             if software is not None:
1527                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1528                 instances.set_success(row["domain"])
1529         except network.exceptions as exception:
1530             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1531             instances.set_last_error(row["domain"], exception)
1532
1533         instances.set_last_nodeinfo(row["domain"])
1534         instances.update(row["domain"])
1535         cnt = cnt + 1
1536
1537     logger.debug("Success! - EXIT!")
1538     return 0
1539
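# Queries the instances.social list API for known instances and registers
# every unknown, wanted domain. Requires an API key in config.json
# ("instances_social_api_key").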
1540 def fetch_instances_social(args: argparse.Namespace) -> int:
1541     logger.debug("args[]='%s' - CALLED!", type(args))
1542
1543     logger.debug("Invoking locking.acquire() ...")
1544     locking.acquire()
1545
1546     source_domain = "instances.social"
1547
1548     if config.get("instances_social_api_key") == "":
1549         logger.error("API key not set. Please set in your config.json file.")
1550         return 1
1551     elif sources.is_recent(source_domain):
1552         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1553         return 2
1554     else:
1555         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1556         sources.update(source_domain)
1557
1558     headers = {
1559         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1560     }
1561
1562     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1563     fetched = network.get_json_api(
1564         source_domain,
1565         "/api/1.0/instances/list?count=0&sort_by=name",
1566         headers=headers,
1567         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1568     )
1569     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1570
1571     if "error_message" in fetched:
1572         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1573         return 2
1574     elif "exception" in fetched:
1575         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1576         return 3
1577     elif "json" not in fetched:
1578         logger.warning("fetched has no element 'json' - EXIT!")
1579         return 4
1580     elif "instances" not in fetched["json"]:
1581         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1582         return 5
1583
1584     domains = list()
1585     rows = fetched["json"]["instances"]
1586
1587     logger.info("Checking %d row(s) ...", len(rows))
1588     for row in rows:
1589         logger.debug("row[]='%s'", type(row))
1590         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1591         logger.debug("domain='%s' - AFTER!", domain)
1592
1593         if domain in [None, ""]:
1594             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1595             continue
1596
1597         logger.debug("domain='%s' - BEFORE!", domain)
1598         domain = domain.encode("idna").decode("utf-8")
1599         logger.debug("domain='%s' - AFTER!", domain)
1600
1601         if not domain_helper.is_wanted(domain):
1602             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1603             continue
1604         elif domain in domains:
1605             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1606             continue
1607         elif instances.is_registered(domain):
1608             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1609             continue
1610         elif instances.is_recent(domain):
1611             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1612             continue
1613
1614         logger.info("Fetching instances from domain='%s'", domain)
1615         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1616
1617     logger.debug("Success! - EXIT!")
1618     return 0
1619
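# Fetches the relay directory from api.relaylist.com (endpoint /relays) and
# registers every unknown, wanted relay domain.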
1620 def fetch_relaylist(args: argparse.Namespace) -> int:
1621     logger.debug("args[]='%s' - CALLED!", type(args))
1622
1623     logger.debug("Invoking locking.acquire() ...")
1624     locking.acquire()
1625
1626     source_domain = "api.relaylist.com"
1627
1628     if sources.is_recent(source_domain):
1629         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1630         return 1
1631     else:
1632         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1633         sources.update(source_domain)
1634
1635     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1636     fetched = network.get_json_api(
1637         source_domain,
1638         "/relays",
1639         {},
1640         (config.get("connection_timeout"), config.get("read_timeout"))
1641     )
1642     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1643
1644     if "error_message" in fetched:
1645         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1646         return 2
1647     elif "exception" in fetched:
1648         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1649         return 3
1650     elif "json" not in fetched:
1651         logger.warning("fetched has no element 'json' - EXIT!")
1652         return 4
1653
1654     domains = list()
1655
1656     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1657     for row in fetched["json"]:
1658         logger.debug("row[]='%s'", type(row))
1659         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1660         logger.debug("domain='%s' - AFTER!", domain)
1661
1662         if domain in [None, ""]:
1663             logger.debug("domain='%s' is empty after parsing row[url] - SKIPPED!", domain)
1664             continue
1665
1666         logger.debug("domain='%s' - BEFORE!", domain)
1667         domain = domain.encode("idna").decode("utf-8")
1668         logger.debug("domain='%s' - AFTER!", domain)
1669
1670         if not domain_helper.is_wanted(domain):
1671             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1672             continue
1673         elif domain in domains:
1674             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1675             continue
1676         elif instances.is_registered(domain):
1677             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1678             continue
1679         elif instances.is_recent(domain):
1680             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1681             continue
1682
1683         logger.info("Fetching instances from domain='%s'", domain)
1684         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1685
1686     logger.debug("Success! - EXIT!")
1687     return 0
1688
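# Walks registered relay instances (activityrelay, aoderelay, selective-relay,
# pub-relay) and collects their peer lists: pub-relay exposes peers through
# nodeinfo metadata, the other types are scraped from their HTML landing
# pages. Peers are stored per relay, then all newly seen domains are fetched.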
1689 def fetch_relays(args: argparse.Namespace) -> int:
1690     logger.debug("args[]='%s' - CALLED!", type(args))
1691
1692     logger.debug("Invoking locking.acquire() ...")
1693     locking.acquire()
1694
1695     if args.domain is not None and args.domain != "":
1696         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1697         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1698     elif args.software is not None and args.software != "":
1699         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1700         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1701     else:
1702         logger.debug("Fetch all relay instances ...")
1703         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1704
1705     domains = list()
1706     rows = database.cursor.fetchall()
1707
1708     logger.info("Checking %d relays ...", len(rows))
1709     for row in rows:
1710         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1711         if not args.force and instances.is_recent(row["domain"]):
1712             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1713             continue
1714         elif row["nodeinfo_url"] is None:
1715             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1716             continue
1717
1718         peers = list()
1719         try:
1720             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1721             if row["software"] == "pub-relay":
1722                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1723                 raw = network.fetch_api_url(
1724                     row["nodeinfo_url"],
1725                     (config.get("connection_timeout"), config.get("read_timeout"))
1726                 )
1727
1728                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1729                 if "exception" in raw:
1730                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1731                     raise raw["exception"]
1732                 elif "error_message" in raw:
1733                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1734                     instances.set_last_error(row["domain"], raw)
1735                     instances.set_last_instance_fetch(row["domain"])
1736                     instances.update(row["domain"])
1737                     continue
1738                 elif "json" not in raw:
1739                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1740                     continue
1741                 elif "metadata" not in raw["json"]:
1742                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1743                     continue
1744                 elif "peers" not in raw["json"]["metadata"]:
1745                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1746                     continue
1747             else:
1748                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1749                 raw = utils.fetch_url(
1750                     f"https://{row['domain']}",
1751                     network.web_headers,
1752                     (config.get("connection_timeout"), config.get("read_timeout"))
1753                 ).text
1754                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1755
1756                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1757                 logger.debug("doc[]='%s'", type(doc))
1758
1759         except network.exceptions as exception:
1760             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1761             instances.set_last_error(row["domain"], exception)
1762             instances.set_last_instance_fetch(row["domain"])
1763             instances.update(row["domain"])
1764             continue
1765
1766         logger.debug("row[software]='%s'", row["software"])
1767         if row["software"] == "activityrelay":
1768             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1769             tags = doc.findAll("p")
1770
1771             logger.debug("Checking %d paragraphs ...", len(tags))
1772             for tag in tags:
1773                 logger.debug("tag[]='%s'", type(tag))
1774                 if len(tag.contents) == 0:
1775                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1776                     continue
1777                 elif "registered instances" not in tag.contents[0]:
1778                     logger.debug("Skipping paragraph, text not found.")
1779                     continue
1780
1781                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1782                 for domain in tag.contents:
1783                     logger.debug("domain[%s]='%s'", type(domain), domain)
1784                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1785                         continue
1786
1787                     domain = str(domain)
1788                     logger.debug("domain='%s'", domain)
1789                     if not domain_helper.is_wanted(domain):
1790                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1791                         continue
1792
1793                     logger.debug("domain='%s' - BEFORE!", domain)
1794                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1795                     logger.debug("domain='%s' - AFTER!", domain)
1796
1797                     if domain in [None, ""]:
1798                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1799                         continue
1800                     elif domain not in peers:
1801                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1802                         peers.append(domain)
1803
1804                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1805                     if dict_helper.has_key(domains, "domain", domain):
1806                         logger.debug("domain='%s' already added", domain)
1807                         continue
1808
1809                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1810                     domains.append({
1811                         "domain": domain,
1812                         "origin": row["domain"],
1813                     })
1814         elif row["software"] in ["aoderelay", "selective-relay"]:
1815             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1816             if row["software"] == "aoderelay":
1817                 tags = doc.findAll("section", {"class": "instance"})
1818             else:
1819                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1820
1821             logger.debug("Checking %d tags ...", len(tags))
1822             for tag in tags:
1823                 logger.debug("tag[]='%s'", type(tag))
1824
1825                 link = tag.find("a")
1826                 logger.debug("link[%s]='%s'", type(link), link)
1827                 if not isinstance(link, bs4.element.Tag):
1828                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1829                     continue
1830
1831                 components = urlparse(link.get("href"))
1832                 logger.debug("components(%d)='%s'", len(components), components)
1833                 domain = components.netloc.lower().split(":")[0]
1834
1835                 logger.debug("domain='%s' - BEFORE!", domain)
1836                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1837                 logger.debug("domain='%s' - AFTER!", domain)
1838
1839                 if domain in [None, ""]:
1840                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1841                     continue
1842                 elif domain not in peers:
1843                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1844                     peers.append(domain)
1845
1846                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1847                 if dict_helper.has_key(domains, "domain", domain):
1848                     logger.debug("domain='%s' already added", domain)
1849                     continue
1850
1851                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1852                 domains.append({
1853                     "domain": domain,
1854                     "origin": row["domain"],
1855                 })
1856         elif row["software"] == "pub-relay":
1857             logger.debug("Checking %d peer(s) from row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1858             for domain in raw["json"]["metadata"]["peers"]:
1859                 logger.debug("domain='%s' - BEFORE!", domain)
1860                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1861                 logger.debug("domain='%s' - AFTER!", domain)
1862
1863                 if domain in [None, ""]:
1864                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1865                     continue
1866                 elif domain not in peers:
1867                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1868                     peers.append(domain)
1869
1870                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1871                 if dict_helper.has_key(domains, "domain", domain):
1872                     logger.debug("domain='%s' already added", domain)
1873                     continue
1874
1875                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1876                 domains.append({
1877                     "domain": domain,
1878                     "origin": row["domain"],
1879                 })
1880         else:
1881             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1882             continue
1883
1884         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1885         instances.set_last_instance_fetch(row["domain"])
1886
1887         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1888         instances.set_total_peers(row["domain"], peers)
1889
1890         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1891         instances.update(row["domain"])
1892
1893     logger.info("Checking %d domains ...", len(domains))
1894     for row in domains:
1895         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1896         if not domain_helper.is_wanted(row["domain"]):
1897             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1898             continue
1899         elif instances.is_registered(row["domain"]):
1900             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1901             continue
1902
1903         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1904         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1905
1906     logger.debug("Success! - EXIT!")
1907     return 0
1908
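# One-shot maintenance command: rewrites all non-punycode domain columns in
# the instances and blocks tables into their IDNA ("xn--") representation,
# e.g. "bücher.de" becomes "xn--bcher-kva.de".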
1909 def convert_idna(args: argparse.Namespace) -> int:
1910     logger.debug("args[]='%s' - CALLED!", type(args))
1911
1912     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1913     rows = database.cursor.fetchall()
1914
1915     logger.debug("rows[]='%s'", type(rows))
1916     instances.translate_idnas(rows, "domain")
1917
1918     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1919     rows = database.cursor.fetchall()
1920
1921     logger.debug("rows[]='%s'", type(rows))
1922     instances.translate_idnas(rows, "origin")
1923
1924     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1925     rows = database.cursor.fetchall()
1926
1927     logger.debug("rows[]='%s'", type(rows))
1928     blocks.translate_idnas(rows, "blocker")
1929
1930     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1931     rows = database.cursor.fetchall()
1932
1933     logger.debug("rows[]='%s'", type(rows))
1934     blocks.translate_idnas(rows, "blocked")
1935
1936     logger.debug("Success! - EXIT!")
1937     return 0
1938
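# Maintenance command: deletes instances (and any block records referencing
# them) whose stored domain fails validators.domain(), then VACUUMs the
# database to reclaim space.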
1939 def remove_invalid(args: argparse.Namespace) -> int:
1940     logger.debug("args[]='%s' - CALLED!", type(args))
1941
1942     logger.debug("Invoking locking.acquire() ...")
1943     locking.acquire()
1944
1945     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1946     rows = database.cursor.fetchall()
1947
1948     logger.info("Checking %d domains ...", len(rows))
1949     for row in rows:
1950         logger.debug("row[domain]='%s'", row["domain"])
1951         if not validators.domain(row["domain"].split("/")[0]):
1952             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1953             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1954             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1955
1956     logger.debug("Invoking commit() ...")
1957     database.connection.commit()
1958
1959     logger.info("Vacuum cleaning database ...")
1960     database.cursor.execute("VACUUM")
1961
1962     logger.debug("Success! - EXIT!")
1963     return 0