]> git.mxchange.org Git - fba.git/blob - fba/commands.py
Continued/WIP:
[fba.git] / fba / commands.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import csv
18 import inspect
19 import json
20 import logging
21 import time
22
23 from urllib.parse import urlparse
24
25 import argparse
26 import atoma
27 import bs4
28 import markdown
29 import reqto
30 import validators
31
32 from fba import database
33 from fba import utils
34
35 from fba.helpers import blacklist
36 from fba.helpers import blocklists
37 from fba.helpers import config
38 from fba.helpers import cookies
39 from fba.helpers import dicts as dict_helper
40 from fba.helpers import domain as domain_helper
41 from fba.helpers import locking
42 from fba.helpers import processing
43 from fba.helpers import software as software_helper
44 from fba.helpers import tidyup
45
46 from fba.http import csrf
47 from fba.http import federation
48 from fba.http import network
49
50 from fba.models import blocks
51 from fba.models import instances
52 from fba.models import sources
53
54 from fba.networks import friendica
55 from fba.networks import lemmy
56 from fba.networks import mastodon
57 from fba.networks import misskey
58 from fba.networks import pleroma
59
60 logging.basicConfig(level=logging.INFO)
61 logger = logging.getLogger(__name__)
62 #logger.setLevel(logging.DEBUG)
63
64 def check_instance(args: argparse.Namespace) -> int:
65     logger.debug("args.domain='%s' - CALLED!", args.domain)
66
67     status = 0
68     if not validators.domain(args.domain):
69         logger.warning("args.domain='%s' is not valid", args.domain)
70         status = 100
71     elif blacklist.is_blacklisted(args.domain):
72         logger.warning("args.domain='%s' is blacklisted", args.domain)
73         status = 101
74     elif instances.is_registered(args.domain):
75         logger.warning("args.domain='%s' is already registered", args.domain)
76         status = 102
77     else:
78         logger.info("args.domain='%s' is not known", args.domain)
79
80     logger.debug("status=%d - EXIT!", status)
81     return status
82
83 def check_nodeinfo(args: argparse.Namespace) -> int:
84     logger.debug("args[]='%s' - CALLED!", type(args))
85
86     # Fetch rows
87     database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")
88
89     cnt = 0
90     for row in database.cursor.fetchall():
91         logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
92         punycode = row["domain"].encode("idna").decode("utf-8")
93
94         if row["nodeinfo_url"].startswith("/"):
95             logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
96             continue
97         elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
98             logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
99             cnt = cnt + 1
100
101     logger.info("Found %d row(s)", cnt)
102
103     logger.debug("EXIT!")
104     return 0
105
106 def fetch_pixelfed_api(args: argparse.Namespace) -> int:
107     logger.debug("args[]='%s' - CALLED!", type(args))
108
109     # No CSRF by default, you don't have to add network.source_headers by yourself here
110     headers = tuple()
111     source_domain = "pixelfed.org"
112
113     if sources.is_recent(source_domain):
114         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
115         return 1
116     else:
117         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
118         sources.update(source_domain)
119
120     try:
121         logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
122         headers = csrf.determine(source_domain, dict())
123     except network.exceptions as exception:
124         logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
125         return list()
126
127     try:
128         logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
129         fetched = network.get_json_api(
130             source_domain,
131             "/api/v1/servers/all.json?scope=All&country=all&language=all",
132             headers,
133             (config.get("connection_timeout"), config.get("read_timeout"))
134         )
135
136         logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
137         if "error_message" in fetched:
138             logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
139             return 101
140         elif "data" not in fetched["json"]:
141             logger.warning("API did not return JSON with 'data' element - EXIT!")
142             return 102
143
144         rows = fetched["json"]["data"]
145         logger.info("Checking %d fetched rows ...", len(rows))
146         for row in rows:
147             logger.debug("row[]='%s'", type(row))
148             if "domain" not in row:
149                 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
150                 continue
151             elif row["domain"] in [None, ""]:
152                 logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
153                 continue
154
155             logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
156             domain = row["domain"].encode("idna").decode("utf-8")
157             logger.debug("domain='%s' - AFTER!", domain)
158
159             if not domain_helper.is_wanted(domain):
160                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
161                 continue
162             elif instances.is_registered(domain):
163                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
164                 continue
165             elif instances.is_recent(domain):
166                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
167                 continue
168
169             logger.debug("Fetching instances from domain='%s' ...", domain)
170             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
171
172     except network.exceptions as exception:
173         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
174         return 103
175
176     logger.debug("Success! - EXIT!")
177     return 0
178
179 def fetch_bkali(args: argparse.Namespace) -> int:
180     logger.debug("args[]='%s' - CALLED!", type(args))
181
182     logger.debug("Invoking locking.acquire() ...")
183     locking.acquire()
184
185     source_domain = "gql.api.bka.li"
186     if sources.is_recent(source_domain):
187         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
188         return 1
189     else:
190         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
191         sources.update(source_domain)
192
193     domains = list()
194     try:
195         logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
196         fetched = network.post_json_api(
197             source_domain,
198             "/v1/graphql",
199             json.dumps({
200                 "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
201             })
202         )
203
204         logger.debug("fetched[]='%s'", type(fetched))
205         if "error_message" in fetched:
206             logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
207             return 100
208         elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
209             logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])
210             return 101
211
212         rows = fetched["json"]
213
214         logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
215         if len(rows) == 0:
216             raise Exception("WARNING: Returned no records")
217         elif "data" not in rows:
218             raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
219         elif "nodeinfo" not in rows["data"]:
220             raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")
221
222         for entry in rows["data"]["nodeinfo"]:
223             logger.debug("entry[%s]='%s'", type(entry), entry)
224             if "domain" not in entry:
225                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
226                 continue
227             elif entry["domain"] in [None, ""]:
228                 logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
229                 continue
230             elif not domain_helper.is_wanted(entry["domain"]):
231                 logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
232                 continue
233             elif instances.is_registered(entry["domain"]):
234                 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
235                 continue
236             elif instances.is_recent(entry["domain"]):
237                 logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
238                 continue
239
240             logger.debug("Adding domain='%s' ...", entry["domain"])
241             domains.append(entry["domain"])
242
243     except network.exceptions as exception:
244         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
245         return 102
246
247     logger.debug("domains()=%d", len(domains))
248     if len(domains) > 0:
249         logger.info("Adding %d new instances ...", len(domains))
250         for domain in domains:
251             logger.debug("domain='%s' - BEFORE!", domain)
252             domain = domain.encode("idna").decode("utf-8")
253             logger.debug("domain='%s' - AFTER!", domain)
254
255             try:
256                 logger.info("Fetching instances from domain='%s' ...", domain)
257                 federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
258             except network.exceptions as exception:
259                 logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
260                 instances.set_last_error(domain, exception)
261                 return 100
262
263     logger.debug("Success - EXIT!")
264     return 0
265
266 def fetch_blocks(args: argparse.Namespace) -> int:
267     logger.debug("args[]='%s' - CALLED!", type(args))
268     if args.domain is not None and args.domain != "":
269         logger.debug("args.domain='%s' - checking ...", args.domain)
270         if not validators.domain(args.domain):
271             logger.warning("args.domain='%s' is not valid.", args.domain)
272             return 100
273         elif blacklist.is_blacklisted(args.domain):
274             logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
275             return 101
276         elif not instances.is_registered(args.domain):
277             logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
278             return 102
279
280     logger.debug("Invoking locking.acquire() ...")
281     locking.acquire()
282
283     if args.domain is not None and args.domain != "":
284         # Re-check single domain
285         logger.debug("Querying database for args.domain='%s' ...", args.domain)
286         database.cursor.execute(
287             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
288         )
289     elif args.software is not None and args.software != "":
290         # Re-check single software
291         logger.debug("Querying database for args.software='%s' ...", args.software)
292         database.cursor.execute(
293             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
294         )
295     elif args.only_none:
296         # Check only entries with total_blocked=None
297         database.cursor.execute(
298             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
299         )
300     else:
301         # Re-check after "timeout" (aka. minimum interval)
302         database.cursor.execute(
303             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
304         )
305
306     rows = database.cursor.fetchall()
307     logger.info("Checking %d entries ...", len(rows))
308     for blocker, software, origin, nodeinfo_url in rows:
309         logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
310
311         if not domain_helper.is_wanted(blocker):
312             logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
313             continue
314         elif not args.force and instances.is_recent(blocker, "last_blocked"):
315             logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
316             continue
317
318         logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
319         instances.set_last_blocked(blocker)
320         instances.set_has_obfuscation(blocker, False)
321
322         # c.s isn't part of oliphant's "hidden" blocklists
323         if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
324             logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
325             continue
326
327         logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
328         blocking = federation.fetch_blocks(blocker)
329
330         logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
331         if len(blocking) == 0:
332             logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
333             if software == "pleroma":
334                 blocking = pleroma.fetch_blocks(blocker)
335                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
336             elif software == "mastodon":
337                 blocking = mastodon.fetch_blocks(blocker)
338                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
339             elif software == "lemmy":
340                 blocking = lemmy.fetch_blocks(blocker)
341                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
342             elif software == "friendica":
343                 blocking = friendica.fetch_blocks(blocker)
344                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
345             elif software == "misskey":
346                 blocking = misskey.fetch_blocks(blocker)
347                 logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
348             else:
349                 logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
350
351         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
352         instances.set_total_blocks(blocker, blocking)
353
354         blockdict = list()
355         deobfuscated = obfuscated = 0
356
357         logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
358         for block in blocking:
359             logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])
360
361             if block["block_level"] == "":
362                 logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
363                 continue
364
365             logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
366             block["blocked"] = tidyup.domain(block["blocked"])
367             block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
368             logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])
369
370             if block["blocked"] in [None, ""]:
371                 logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
372                 continue
373             elif block["blocked"].endswith(".onion"):
374                 logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
375                 continue
376             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
377                 logger.debug("blocked='%s' is an I2P .onion domain - SKIPPED", block["blocked"])
378                 continue
379             elif block["blocked"].endswith(".arpa"):
380                 logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
381                 continue
382             elif block["blocked"].endswith(".tld"):
383                 logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
384                 continue
385             elif block["blocked"].find("*") >= 0:
386                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
387                 instances.set_has_obfuscation(blocker, True)
388                 obfuscated = obfuscated + 1
389
390                 # Some friendica servers also obscure domains without hash
391                 row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)
392
393                 logger.debug("row[]='%s'", type(row))
394                 if row is None:
395                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
396                     continue
397
398                 deobfuscated = deobfuscated + 1
399                 block["blocked"] = row["domain"]
400                 origin           = row["origin"]
401                 nodeinfo_url     = row["nodeinfo_url"]
402             elif block["blocked"].find("?") >= 0:
403                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
404                 instances.set_has_obfuscation(blocker, True)
405                 obfuscated = obfuscated + 1
406
407                 # Some obscure them with question marks, not sure if that's dependent on version or not
408                 row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)
409
410                 logger.debug("row[]='%s'", type(row))
411                 if row is None:
412                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
413                     continue
414
415                 deobfuscated = deobfuscated + 1
416                 block["blocked"] = row["domain"]
417                 origin           = row["origin"]
418                 nodeinfo_url     = row["nodeinfo_url"]
419
420             logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
421             if block["blocked"] in [None, ""]:
422                 logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
423                 continue
424
425             logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
426             block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
427             logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
428
429             if not domain_helper.is_wanted(block["blocked"]):
430                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
431                 continue
432             elif block["block_level"] in ["accept", "accepted"]:
433                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
434                 continue
435             elif not instances.is_registered(block["blocked"]):
436                 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
437                 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
438
439             block["block_level"] = blocks.alias_block_level(block["block_level"])
440
441             if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
442                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
443                 blockdict.append({
444                     "blocked": block["blocked"],
445                     "reason" : block["reason"],
446                 })
447
448             logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
449             cookies.clear(block["blocked"])
450
451         logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
452         instances.set_obfuscated_blocks(blocker, obfuscated)
453
454         logger.debug("Flushing updates for blocker='%s' ...", blocker)
455         instances.update(blocker)
456
457         logger.debug("Invoking commit() ...")
458         database.connection.commit()
459
460         logger.debug("Invoking cookies.clear(%s) ...", blocker)
461         cookies.clear(blocker)
462
463         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
464         if config.get("bot_enabled") and len(blockdict) > 0:
465             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
466             network.send_bot_post(blocker, blockdict)
467
468     logger.debug("Success! - EXIT!")
469     return 0
470
471 def fetch_observer(args: argparse.Namespace) -> int:
472     logger.debug("args[]='%s' - CALLED!", type(args))
473
474     logger.debug("Invoking locking.acquire() ...")
475     locking.acquire()
476
477     source_domain = "fediverse.observer"
478     if sources.is_recent(source_domain):
479         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
480         return 1
481     else:
482         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
483         sources.update(source_domain)
484
485     types = list()
486     if args.software is None:
487         logger.info("Fetching software list ...")
488         raw = network.fetch_url(
489             f"https://{source_domain}",
490             network.web_headers,
491             (config.get("connection_timeout"), config.get("read_timeout"))
492         ).text
493         logger.debug("raw[%s]()=%d", type(raw), len(raw))
494
495         doc = bs4.BeautifulSoup(raw, features="html.parser")
496         logger.debug("doc[]='%s'", type(doc))
497
498         navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
499         logger.debug("navbar[]='%s'", type(navbar))
500         if navbar is None:
501             logger.warning("Cannot find navigation bar, cannot continue!")
502             return 1
503
504         items = navbar.findAll("a", {"class": "dropdown-item"})
505         logger.debug("items[]='%s'", type(items))
506
507         logger.info("Checking %d menu items ...", len(items))
508         for item in items:
509             logger.debug("item[%s]='%s'", type(item), item)
510             if item.text.lower() == "all":
511                 logger.debug("Skipping 'All' menu entry ...")
512                 continue
513
514             logger.debug("Appending item.text='%s' ...", item.text)
515             types.append(tidyup.domain(item.text))
516     else:
517         logger.info("Adding args.software='%s' as type ...", args.software)
518         types.append(args.software)
519
520     logger.info("Fetching %d different table data ...", len(types))
521     for software in types:
522         logger.debug("software='%s'", software)
523
524         if args.software is not None and args.software != software:
525             logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
526             continue
527
528         items = list()
529         try:
530             logger.debug("Fetching table data for software='%s' ...", software)
531             raw = network.post_json_api(
532                 f"api.{source_domain}",
533                 "/",
534                 json.dumps({
535                     "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
536                 })
537             )
538
539             logger.debug("raw[%s]()=%d", type(raw), len(raw))
540             if "exception" in raw:
541                 logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
542                 raise raw["exception"]
543             elif "error_message" in raw:
544                 logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
545                 continue
546             elif not "data" in raw["json"]:
547                 logger.warning("Cannot find key 'nodes' in raw[json]()=%d", len(raw["json"]))
548                 continue
549             elif not "nodes" in raw["json"]["data"]:
550                 logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
551                 continue
552
553             items = raw["json"]["data"]["nodes"]
554             logger.debug("items()=%d", len(items))
555
556         except network.exceptions as exception:
557             logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
558             continue
559
560         logger.info("Checking %d items,software='%s' ...", len(items), software)
561         for item in items:
562             logger.debug("item[]='%s'", type(item))
563             if not "domain" in item:
564                 logger.debug("item()=%d has not element 'domain'", len(item))
565                 continue
566
567             logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
568             domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
569             logger.debug("domain='%s' - AFTER!", domain)
570
571             if domain in [None, ""]:
572                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
573                 continue
574
575             logger.debug("domain='%s' - BEFORE!", domain)
576             domain = domain.encode("idna").decode("utf-8")
577             logger.debug("domain='%s' - AFTER!", domain)
578
579             if not domain_helper.is_wanted(domain):
580                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
581                 continue
582             elif instances.is_registered(domain):
583                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
584                 continue
585
586             logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
587             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
588
589     logger.debug("Success! - EXIT!")
590     return 0
591
592 def fetch_todon_wiki(args: argparse.Namespace) -> int:
593     logger.debug("args[]='%s' - CALLED!", type(args))
594
595     logger.debug("Invoking locking.acquire() ...")
596     locking.acquire()
597
598     source_domain = "wiki.todon.eu"
599     if sources.is_recent(source_domain):
600         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
601         return 1
602     else:
603         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
604         sources.update(source_domain)
605
606     blocklist = {
607         "silenced": list(),
608         "reject": list(),
609     }
610
611     logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
612     raw = network.fetch_url(
613         f"https://{source_domain}/todon/domainblocks",
614         network.web_headers,
615         (config.get("connection_timeout"), config.get("read_timeout"))
616     ).text
617     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
618
619     doc = bs4.BeautifulSoup(raw, "html.parser")
620     logger.debug("doc[]='%s'", type(doc))
621
622     silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
623     logger.info("Checking %d silenced/limited entries ...", len(silenced))
624     blocklist["silenced"] = utils.find_domains(silenced, "div")
625
626     suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
627     logger.info("Checking %d suspended entries ...", len(suspended))
628     blocklist["reject"] = utils.find_domains(suspended, "div")
629
630     blocking = blocklist["silenced"] + blocklist["reject"]
631     blocker = "todon.eu"
632
633     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
634     instances.set_last_blocked(blocker)
635     instances.set_total_blocks(blocker, blocking)
636
637     blockdict = list()
638     for block_level in blocklist:
639         blockers = blocklist[block_level]
640
641         logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
642         for blocked in blockers:
643             logger.debug("blocked='%s'", blocked)
644
645             if not instances.is_registered(blocked):
646                 try:
647                     logger.info("Fetching instances from domain='%s' ...", blocked)
648                     federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
649                 except network.exceptions as exception:
650                     logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
651                     instances.set_last_error(blocked, exception)
652
653             if not domain_helper.is_wanted(blocked):
654                 logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
655                 continue
656             elif not domain_helper.is_wanted(blocker):
657                 logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
658                 continue
659             elif blocks.is_instance_blocked(blocker, blocked, block_level):
660                 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
661                 continue
662
663             logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
664             if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
665                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
666                 blockdict.append({
667                     "blocked": blocked,
668                     "reason" : None,
669                 })
670
671         logger.debug("Invoking commit() ...")
672         database.connection.commit()
673
674         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
675         if config.get("bot_enabled") and len(blockdict) > 0:
676             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
677             network.send_bot_post(blocker, blockdict)
678
679     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
680     if instances.has_pending(blocker):
681         logger.debug("Flushing updates for blocker='%s' ...", blocker)
682         instances.update(blocker)
683
684     logger.debug("Success! - EXIT!")
685     return 0
686
687 def fetch_cs(args: argparse.Namespace):
688     logger.debug("args[]='%s' - CALLED!", type(args))
689
690     logger.debug("Invoking locking.acquire() ...")
691     locking.acquire()
692
693     extensions = [
694         "extra",
695         "abbr",
696         "attr_list",
697         "def_list",
698         "fenced_code",
699         "footnotes",
700         "md_in_html",
701         "admonition",
702         "codehilite",
703         "legacy_attrs",
704         "legacy_em",
705         "meta",
706         "nl2br",
707         "sane_lists",
708         "smarty",
709         "toc",
710         "wikilinks"
711     ]
712
713     blocklist = {
714         "silenced": list(),
715         "reject"  : list(),
716     }
717
718     source_domain = "raw.githubusercontent.com"
719     if sources.is_recent(source_domain):
720         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
721         return 1
722     else:
723         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
724         sources.update(source_domain)
725
726     logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
727     raw = network.fetch_url(
728         f"https://{source_domain}/chaossocial/meta/master/federation.md",
729         network.web_headers,
730         (config.get("connection_timeout"), config.get("read_timeout"))
731     ).text
732     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
733
734     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
735     logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
736
737     silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
738     logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
739     blocklist["silenced"] = federation.find_domains(silenced)
740
741     blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
742     logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
743     blocklist["reject"] = federation.find_domains(blocked)
744
745     blocking = blocklist["silenced"] + blocklist["reject"]
746     blocker = "chaos.social"
747
748     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
749     instances.set_last_blocked(blocker)
750     instances.set_total_blocks(blocker, blocking)
751
752     logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
753     if len(blocking) > 0:
754         blockdict = list()
755         for block_level in blocklist:
756             logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))
757
758             for row in blocklist[block_level]:
759                 logger.debug("row[%s]='%s'", type(row), row)
760                 if not "domain" in row:
761                     logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
762                     continue
763                 elif not instances.is_registered(row["domain"]):
764                     try:
765                         logger.info("Fetching instances from domain='%s' ...", row["domain"])
766                         federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
767                     except network.exceptions as exception:
768                         logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
769                         instances.set_last_error(row["domain"], exception)
770
771                 if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
772                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
773                     blockdict.append({
774                         "blocked": row["domain"],
775                         "reason" : row["reason"],
776                     })
777
778         logger.debug("Invoking commit() ...")
779         database.connection.commit()
780
781         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
782         if config.get("bot_enabled") and len(blockdict) > 0:
783             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
784             network.send_bot_post(blocker, blockdict)
785
786     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
787     if instances.has_pending(blocker):
788         logger.debug("Flushing updates for blocker='%s' ...", blocker)
789         instances.update(blocker)
790
791     logger.debug("Success! - EXIT!")
792     return 0
793
794 def fetch_fba_rss(args: argparse.Namespace) -> int:
795     logger.debug("args[]='%s' - CALLED!", type(args))
796
797     domains = list()
798
799     logger.debug("Invoking locking.acquire() ...")
800     locking.acquire()
801
802     components = urlparse(args.feed)
803     domain = components.netloc.lower().split(":")[0]
804
805     logger.debug("domain='%s'", domain)
806     if sources.is_recent(domain):
807         logger.info("API from domain='%s' has recently being accessed - EXIT!", domain)
808         return 0
809     else:
810         logger.debug("domain='%s' has not been recently used, marking ...", domain)
811         sources.update(domain)
812
813     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
814     response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
815
816     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
817     if response.ok and response.status_code == 200 and len(response.text) > 0:
818         logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
819         rss = atoma.parse_rss_bytes(response.content)
820
821         logger.debug("rss[]='%s'", type(rss))
822         for item in rss.items:
823             logger.debug("item[%s]='%s'", type(item), item)
824             domain = item.link.split("=")[1]
825             domain = tidyup.domain(domain) if domain not in[None, ""] else None
826
827             logger.debug("domain='%s' - AFTER!", domain)
828             if domain in [None, ""]:
829                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
830                 continue
831
832             logger.debug("domain='%s' - BEFORE!", domain)
833             domain = domain.encode("idna").decode("utf-8")
834             logger.debug("domain='%s' - AFTER!", domain)
835
836             if not domain_helper.is_wanted(domain):
837                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
838                 continue
839             elif domain in domains:
840                 logger.debug("domain='%s' is already added - SKIPPED!", domain)
841                 continue
842             elif instances.is_registered(domain):
843                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
844                 continue
845             elif instances.is_recent(domain):
846                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
847                 continue
848
849             logger.debug("Adding domain='%s'", domain)
850             domains.append(domain)
851
852     logger.debug("domains()=%d", len(domains))
853     if len(domains) > 0:
854         logger.info("Adding %d new instances ...", len(domains))
855         for domain in domains:
856             logger.debug("domain='%s'", domain)
857             try:
858                 logger.info("Fetching instances from domain='%s' ...", domain)
859                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
860             except network.exceptions as exception:
861                 logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
862                 instances.set_last_error(domain, exception)
863                 return 100
864
865     logger.debug("Success! - EXIT!")
866     return 0
867
868 def fetch_fbabot_atom(args: argparse.Namespace) -> int:
869     logger.debug("args[]='%s' - CALLED!", type(args))
870
871     logger.debug("Invoking locking.acquire() ...")
872     locking.acquire()
873
874     source_domain = "ryona.agency"
875     feed = f"https://{source_domain}/users/fba/feed.atom"
876
877     logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
878     if args.feed is not None and validators.url(args.feed):
879         logger.debug("Setting feed='%s' ...", args.feed)
880         feed = str(args.feed)
881         source_domain = urlparse(args.feed).netloc
882
883     if sources.is_recent(source_domain):
884         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
885         return 1
886     else:
887         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
888         sources.update(source_domain)
889
890     domains = list()
891
892     logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
893     response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
894
895     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
896     if response.ok and response.status_code == 200 and len(response.text) > 0:
897         logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
898         atom = atoma.parse_atom_bytes(response.content)
899
900         logger.debug("atom[]='%s'", type(atom))
901         for entry in atom.entries:
902             logger.debug("entry[]='%s'", type(entry))
903             doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
904             logger.debug("doc[]='%s'", type(doc))
905             elements = doc.findAll("a")
906
907             logger.debug("Checking %d element(s) ...", len(elements))
908             for element in elements:
909                 logger.debug("element[%s]='%s'", type(element), element)
910                 for href in element["href"].split(","):
911                     logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
912                     domain = tidyup.domain(href) if href not in [None, ""] else None
913
914                     logger.debug("domain='%s' - AFTER!", domain)
915                     if domain in [None, ""]:
916                         logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
917                         continue
918
919                     logger.debug("domain='%s' - BEFORE!", domain)
920                     domain = domain.encode("idna").decode("utf-8")
921                     logger.debug("domain='%s' - AFTER!", domain)
922
923                     if not domain_helper.is_wanted(domain):
924                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
925                         continue
926                     elif domain in domains:
927                         logger.debug("domain='%s' is already added - SKIPPED!", domain)
928                         continue
929                     elif instances.is_registered(domain):
930                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
931                         continue
932                     elif instances.is_recent(domain):
933                         logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
934                         continue
935
936                     logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
937                     domains.append(domain)
938
939     logger.debug("domains()=%d", len(domains))
940     if len(domains) > 0:
941         logger.info("Adding %d new instances ...", len(domains))
942         for domain in domains:
943             logger.debug("domain='%s'", domain)
944             try:
945                 logger.info("Fetching instances from domain='%s' ...", domain)
946                 federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
947             except network.exceptions as exception:
948                 logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
949                 instances.set_last_error(domain, exception)
950                 return 100
951
952     logger.debug("Success! - EXIT!")
953     return 0
954
955 def fetch_instances(args: argparse.Namespace) -> int:
956     logger.debug("args[]='%s' - CALLED!", type(args))
957
958     logger.debug("Invoking locking.acquire() ...")
959     locking.acquire()
960
961     # Init variables
962     rows = list()
963
964     # Is domain or software set?
965     if args.domain != "":
966         logger.debug("args.domain='%s' - checking ...", args.domain)
967         if not validators.domain(args.domain):
968             logger.warning("args.domain='%s' is not valid.", args.domain)
969             return 100
970         elif blacklist.is_blacklisted(args.domain):
971             logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
972             return 101
973
974         logger.debug("args.domain='%s' - BEFORE!", args.domain)
975         domain = tidyup.domain(args.domain)
976         logger.debug("domain='%s' - AFTER!", domain)
977
978         # Fetch record
979         database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])
980         rows = database.cursor.fetchall()
981
982     logger.info("Checking %d entries ...", len(rows))
983     for row in rows:
984         logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
985         if row["software"] is None:
986             logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
987             continue
988         elif software_helper.is_relay(row["software"]):
989             logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
990             continue
991
992         # Initial fetch
993         try:
994             logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
995             federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
996         except network.exceptions as exception:
997             logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
998             instances.set_last_error(row["domain"], exception)
999             instances.update(row["domain"])
1000             raise exception
1001
1002         if args.single:
1003             logger.debug("Not fetching more instances - BREAK!")
1004             break
1005
1006     # Loop through some instances
1007     database.cursor.execute(
1008         "SELECT domain, origin, software \
1009 FROM instances \
1010 WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') \
1011 AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) \
1012 ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
1013     )
1014
1015     rows = database.cursor.fetchall()
1016     logger.info("Checking %d entries ...", len(rows))
1017     for row in rows:
1018         logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
1019         domain = row["domain"].encode("idna").decode("utf-8")
1020         logger.debug("domain='%s' - AFTER!", domain)
1021
1022         if not domain_helper.is_wanted(domain):
1023             logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain)
1024             continue
1025
1026         try:
1027             logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
1028             federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
1029         except network.exceptions as exception:
1030             logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
1031             instances.set_last_error(domain, exception)
1032
1033     logger.debug("Success - EXIT!")
1034     return 0
1035
1036 def fetch_csv(args: argparse.Namespace) -> int:
1037     logger.debug("args[]='%s' - CALLED!", type(args))
1038
1039     logger.debug("Invoking locking.acquire() ...")
1040     locking.acquire()
1041
1042     logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
1043     for block in blocklists.csv_files:
1044         logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
1045
1046         # Is domain given and not equal blocker?
1047         if isinstance(args.domain, str) and args.domain != block["blocker"]:
1048             logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1049             continue
1050
1051         logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
1052         processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)
1053
1054     logger.debug("Success - EXIT!")
1055     return 0
1056
1057 def fetch_oliphant(args: argparse.Namespace) -> int:
1058     logger.debug("args[]='%s' - CALLED!", type(args))
1059
1060     logger.debug("Invoking locking.acquire() ...")
1061     locking.acquire()
1062
1063     source_domain = "codeberg.org"
1064     if sources.is_recent(source_domain):
1065         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1066         return 1
1067     else:
1068         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1069         sources.update(source_domain)
1070
1071     # Base URL
1072     base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
1073
1074     logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
1075     for block in blocklists.oliphant_blocklists:
1076         # Is domain given and not equal blocker?
1077         logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
1078         if isinstance(args.domain, str) and args.domain != block["blocker"]:
1079             logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1080             continue
1081
1082         url = f"{base_url}/{block['csv_url']}"
1083
1084         logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
1085         processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)
1086
1087     logger.debug("Success! - EXIT!")
1088     return 0
1089
1090 def fetch_txt(args: argparse.Namespace) -> int:
1091     logger.debug("args[]='%s' - CALLED!", type(args))
1092
1093     logger.debug("Invoking locking.acquire() ...")
1094     locking.acquire()
1095
1096     logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
1097     for row in blocklists.txt_files:
1098         logger.debug("Fetching row[url]='%s' ...", row["url"])
1099         response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1100
1101         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1102         if response.ok and response.status_code == 200 and response.text != "":
1103             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1104             domains = response.text.strip().split("\n")
1105
1106             logger.info("Processing %d domains ...", len(domains))
1107             for domain in domains:
1108                 logger.debug("domain='%s' - BEFORE!", domain)
1109                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1110                 logger.debug("domain='%s' - AFTER!", domain)
1111
1112                 if domain in [None, ""]:
1113                     logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1114                     continue
1115                 elif not domain_helper.is_wanted(domain):
1116                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1117                     continue
1118                 elif not args.force and instances.is_registered(domain):
1119                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1120                     continue
1121
1122                 logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
1123                 processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
1124                 logger.debug("processed='%s'", processed)
1125
1126     logger.debug("Success! - EXIT!")
1127     return 0
1128
1129 def fetch_fedipact(args: argparse.Namespace) -> int:
1130     logger.debug("args[]='%s' - CALLED!", type(args))
1131
1132     logger.debug("Invoking locking.acquire() ...")
1133     locking.acquire()
1134
1135     source_domain = "fedipact.online"
1136     if sources.is_recent(source_domain):
1137         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1138         return 1
1139     else:
1140         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1141         sources.update(source_domain)
1142
1143     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1144     response = network.fetch_url(
1145         f"https://{source_domain}",
1146         network.web_headers,
1147         (config.get("connection_timeout"), config.get("read_timeout"))
1148     )
1149
1150     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1151     if response.ok and response.status_code == 200 and response.text != "":
1152         logger.debug("Parsing %d Bytes ...", len(response.text))
1153
1154         doc = bs4.BeautifulSoup(response.text, "html.parser")
1155         logger.debug("doc[]='%s'", type(doc))
1156
1157         rows = doc.findAll("li")
1158         logger.info("Checking %d row(s) ...", len(rows))
1159         for row in rows:
1160             logger.debug("row[]='%s'", type(row))
1161             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1162
1163             logger.debug("domain='%s' - AFTER!", domain)
1164             if domain in [None, ""]:
1165                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1166                 continue
1167
1168             logger.debug("domain='%s' - BEFORE!", domain)
1169             domain = domain.encode("idna").decode("utf-8")
1170             logger.debug("domain='%s' - AFTER!", domain)
1171
1172             if not domain_helper.is_wanted(domain):
1173                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1174                 continue
1175             elif instances.is_registered(domain):
1176                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1177                 continue
1178             elif instances.is_recent(domain):
1179                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1180                 continue
1181
1182             logger.info("Fetching domain='%s' ...", domain)
1183             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1184
1185     logger.debug("Success! - EXIT!")
1186     return 0
1187
1188 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1189     logger.debug("args[]='%s' - CALLED!", type(args))
1190
1191     logger.debug("Invoking locking.acquire() ...")
1192     locking.acquire()
1193
1194     source_domain = "instances.joinmobilizon.org"
1195     if sources.is_recent(source_domain):
1196         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1197         return 1
1198     else:
1199         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1200         sources.update(source_domain)
1201
1202     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1203     raw = network.fetch_url(
1204         f"https://{source_domain}/api/v1/instances",
1205         network.web_headers,
1206         (config.get("connection_timeout"), config.get("read_timeout"))
1207     ).text
1208     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1209
1210     parsed = json.loads(raw)
1211     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1212
1213     if "data" not in parsed:
1214         logger.warning("parsed()=%d does not contain key 'data'")
1215         return 1
1216
1217     logger.info("Checking %d instances ...", len(parsed["data"]))
1218     for row in parsed["data"]:
1219         logger.debug("row[]='%s'", type(row))
1220         if "host" not in row:
1221             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1222             continue
1223         elif not domain_helper.is_wanted(row["host"]):
1224             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1225             continue
1226         elif instances.is_registered(row["host"]):
1227             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1228             continue
1229
1230         logger.info("Fetching row[host]='%s' ...", row["host"])
1231         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1232
1233     logger.debug("Success! - EXIT!")
1234     return 0
1235
1236 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1237     logger.debug("args[]='%s' - CALLED!", type(args))
1238
1239     logger.debug("Invoking locking.acquire() ...")
1240     locking.acquire()
1241
1242     source_domain = "instanceapp.misskey.page"
1243     if sources.is_recent(source_domain):
1244         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1245         return 1
1246     else:
1247         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1248         sources.update(source_domain)
1249
1250     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1251     raw = network.fetch_url(
1252         f"https://{source_domain}/instances.json",
1253         network.web_headers,
1254         (config.get("connection_timeout"), config.get("read_timeout"))
1255     ).text
1256     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1257
1258     parsed = json.loads(raw)
1259     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1260
1261     if "instancesInfos" not in parsed:
1262         logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1263         return 1
1264
1265     logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1266     for row in parsed["instancesInfos"]:
1267         logger.debug("row[%s]='%s'", type(row), row)
1268         if "url" not in row:
1269             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1270             continue
1271         elif not domain_helper.is_wanted(row["url"]):
1272             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1273             continue
1274         elif instances.is_registered(row["url"]):
1275             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1276             continue
1277
1278         logger.info("Fetching row[url]='%s' ...", row["url"])
1279         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1280
1281     logger.debug("Success! - EXIT!")
1282     return 0
1283
1284 def recheck_obfuscation(args: argparse.Namespace) -> int:
1285     logger.debug("args[]='%s' - CALLED!", type(args))
1286
1287     logger.debug("Invoking locking.acquire() ...")
1288     locking.acquire()
1289
1290     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1291         logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1292         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1293     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1294         logger.debug("Fetching records for args.software='%s' ...", args.software)
1295         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1296     else:
1297         logger.debug("Fetching records where domains have obfuscated block entries ...")
1298         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1299
1300     rows = database.cursor.fetchall()
1301     logger.info("Checking %d domains ...", len(rows))
1302     for row in rows:
1303         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1304         if blacklist.is_blacklisted(row["domain"]):
1305             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1306             continue
1307         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1308             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1309             continue
1310
1311         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1312         blocking = federation.fetch_blocks(row["domain"])
1313
1314         logger.debug("blocking()=%d", len(blocking))
1315         if len(blocking) == 0:
1316             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1317             if row["software"] == "pleroma":
1318                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1319                 blocking = pleroma.fetch_blocks(row["domain"])
1320             elif row["software"] == "mastodon":
1321                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1322                 blocking = mastodon.fetch_blocks(row["domain"])
1323             elif row["software"] == "lemmy":
1324                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1325                 blocking = lemmy.fetch_blocks(row["domain"])
1326             elif row["software"] == "friendica":
1327                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1328                 blocking = friendica.fetch_blocks(row["domain"])
1329             elif row["software"] == "misskey":
1330                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1331                 blocking = misskey.fetch_blocks(row["domain"])
1332             else:
1333                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1334
1335         # c.s isn't part of oliphant's "hidden" blocklists
1336         logger.debug("row[domain]='%s'", row["domain"])
1337         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1338             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1339             instances.set_last_blocked(row["domain"])
1340             instances.set_total_blocks(row["domain"], blocking)
1341
1342         obfuscated = 0
1343         blockdict = list()
1344
1345         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1346         for block in blocking:
1347             logger.debug("block[blocked]='%s'", block["blocked"])
1348             blocked = None
1349
1350             if block["blocked"] == "":
1351                 logger.debug("block[blocked] is empty - SKIPPED!")
1352                 continue
1353             elif block["blocked"].endswith(".onion"):
1354                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1355                 continue
1356             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1357                 logger.debug("blocked='%s' is an I2P onion domain name - SKIPPED!", block["blocked"])
1358                 continue
1359             elif block["blocked"].endswith(".arpa"):
1360                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1361                 continue
1362             elif block["blocked"].endswith(".tld"):
1363                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1364                 continue
1365             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1366                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1367                 obfuscated = obfuscated + 1
1368                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1369             elif not domain_helper.is_wanted(block["blocked"]):
1370                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1371                 continue
1372             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1373                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1374                 continue
1375
1376             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1377             if blocked is not None and blocked != block["blocked"]:
1378                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1379                 obfuscated = obfuscated - 1
1380
1381                 if blacklist.is_blacklisted(blocked):
1382                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1383                     continue
1384                 elif blacklist.is_blacklisted(row["domain"]):
1385                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1386                     continue
1387                 elif blocks.is_instance_blocked(row["domain"], blocked):
1388                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1389                     continue
1390
1391                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1392
1393                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1394                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1395                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1396                     blockdict.append({
1397                         "blocked": blocked,
1398                         "reason" : block["reason"],
1399                     })
1400
1401         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1402         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1403         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1404
1405         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1406         if instances.has_pending(row["domain"]):
1407             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1408             instances.update(row["domain"])
1409
1410         logger.debug("Invoking commit() ...")
1411         database.connection.commit()
1412
1413         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1414         if config.get("bot_enabled") and len(blockdict) > 0:
1415             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1416             network.send_bot_post(row["domain"], blockdict)
1417
1418     logger.debug("Success! - EXIT!")
1419     return 0
1420
1421 def fetch_fedilist(args: argparse.Namespace) -> int:
1422     logger.debug("args[]='%s' - CALLED!", type(args))
1423
1424     logger.debug("Invoking locking.acquire() ...")
1425     locking.acquire()
1426
1427     source_domain = "demo.fedilist.com"
1428     if sources.is_recent(source_domain):
1429         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1430         return 1
1431     else:
1432         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1433         sources.update(source_domain)
1434
1435     url = f"http://{source_domain}/instance/csv?onion=not"
1436     if args.software is not None and args.software != "":
1437         logger.debug("args.software='%s'", args.software)
1438         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1439
1440     logger.info("Fetching url='%s' ...", url)
1441     response = reqto.get(
1442         url,
1443         headers=network.web_headers,
1444         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1445         allow_redirects=False
1446     )
1447
1448     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1449     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1450         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1451         return 1
1452
1453     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1454
1455     logger.debug("reader[]='%s'", type(reader))
1456     if reader is None:
1457         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1458         return 2
1459
1460     rows = list(reader)
1461
1462     logger.info("Checking %d rows ...", len(rows))
1463     for row in rows:
1464         logger.debug("row[]='%s'", type(row))
1465         if "hostname" not in row:
1466             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1467             continue
1468
1469         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1470         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1471         logger.debug("domain='%s' - AFTER!", domain)
1472
1473         if domain in [None, ""]:
1474             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1475             continue
1476
1477         logger.debug("domain='%s' - BEFORE!", domain)
1478         domain = domain.encode("idna").decode("utf-8")
1479         logger.debug("domain='%s' - AFTER!", domain)
1480
1481         if not domain_helper.is_wanted(domain):
1482             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1483             continue
1484         elif (args.force is None or not args.force) and instances.is_registered(domain):
1485             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1486             continue
1487         elif instances.is_recent(domain):
1488             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1489             continue
1490
1491         logger.info("Fetching instances from domain='%s' ...", domain)
1492         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1493
1494     logger.debug("Success! - EXIT!")
1495     return 0
1496
1497 def update_nodeinfo(args: argparse.Namespace) -> int:
1498     logger.debug("args[]='%s' - CALLED!", type(args))
1499
1500     logger.debug("Invoking locking.acquire() ...")
1501     locking.acquire()
1502
1503     if args.domain is not None and args.domain != "":
1504         logger.debug("Fetching args.domain='%s'", args.domain)
1505         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1506     elif args.software is not None and args.software != "":
1507         logger.info("Fetching domains for args.software='%s'", args.software)
1508         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1509     elif args.mode is not None and args.mode != "":
1510         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1511         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1512     elif args.no_software:
1513         logger.info("Fetching domains with no software type detected ...")
1514         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1515     elif args.with_software:
1516         logger.info("Fetching domains with any software type detected ...")
1517         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1518     elif args.no_auto:
1519         logger.info("Fetching domains with other detection mode than AUTO_DISOVERY being set ...")
1520         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1521     elif args.no_detection:
1522         logger.info("Fetching domains with no detection mode being set ...")
1523         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1524     else:
1525         logger.info("Fetching domains for recently updated ...")
1526         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1527
1528     domains = database.cursor.fetchall()
1529
1530     logger.info("Checking %d domain(s) ...", len(domains))
1531     cnt = 0
1532     for row in domains:
1533         logger.debug("row[]='%s'", type(row))
1534         if blacklist.is_blacklisted(row["domain"]):
1535             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1536             continue
1537         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1538             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1539             continue
1540
1541         try:
1542             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1543             software = federation.determine_software(row["domain"])
1544
1545             logger.debug("Determined software='%s'", software)
1546             if (software != row["software"] and software is not None) or args.force is True:
1547                 logger.debug("software='%s'", software)
1548                 if software is None:
1549                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1550                     instances.set_nodeinfo_url(row["domain"], None)
1551
1552                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1553                 instances.set_software(row["domain"], software)
1554
1555             if software is not None:
1556                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1557                 instances.set_success(row["domain"])
1558         except network.exceptions as exception:
1559             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1560             instances.set_last_error(row["domain"], exception)
1561
1562         instances.set_last_nodeinfo(row["domain"])
1563         instances.update(row["domain"])
1564         cnt = cnt + 1
1565
1566     logger.debug("Success! - EXIT!")
1567     return 0
1568
1569 def fetch_instances_social(args: argparse.Namespace) -> int:
1570     logger.debug("args[]='%s' - CALLED!", type(args))
1571
1572     logger.debug("Invoking locking.acquire() ...")
1573     locking.acquire()
1574
1575     source_domain = "instances.social"
1576
1577     if config.get("instances_social_api_key") == "":
1578         logger.error("API key not set. Please set in your config.json file.")
1579         return 1
1580     elif sources.is_recent(source_domain):
1581         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1582         return 2
1583     else:
1584         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1585         sources.update(source_domain)
1586
1587     headers = {
1588         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1589     }
1590
1591     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1592     fetched = network.get_json_api(
1593         source_domain,
1594         "/api/1.0/instances/list?count=0&sort_by=name",
1595         headers=headers,
1596         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1597     )
1598     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1599
1600     if "error_message" in fetched:
1601         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1602         return 2
1603     elif "exception" in fetched:
1604         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1605         return 3
1606     elif "json" not in fetched:
1607         logger.warning("fetched has no element 'json' - EXIT!")
1608         return 4
1609     elif "instances" not in fetched["json"]:
1610         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1611         return 5
1612
1613     domains = list()
1614     rows = fetched["json"]["instances"]
1615
1616     logger.info("Checking %d row(s) ...", len(rows))
1617     for row in rows:
1618         logger.debug("row[]='%s'", type(row))
1619         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1620         logger.debug("domain='%s' - AFTER!", domain)
1621
1622         if domain is None and domain == "":
1623             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1624             continue
1625
1626         logger.debug("domain='%s' - BEFORE!", domain)
1627         domain = domain.encode("idna").decode("utf-8")
1628         logger.debug("domain='%s' - AFTER!", domain)
1629
1630         if not domain_helper.is_wanted(domain):
1631             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1632             continue
1633         elif domain in domains:
1634             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1635             continue
1636         elif instances.is_registered(domain):
1637             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1638             continue
1639         elif instances.is_recent(domain):
1640             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1641             continue
1642
1643         logger.info("Fetching instances from domain='%s' ...", domain)
1644         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1645
1646     logger.debug("Success! - EXIT!")
1647     return 0
1648
1649 def fetch_relaylist(args: argparse.Namespace) -> int:
1650     logger.debug("args[]='%s' - CALLED!", type(args))
1651
1652     logger.debug("Invoking locking.acquire() ...")
1653     locking.acquire()
1654
1655     source_domain = "api.relaylist.com"
1656
1657     if sources.is_recent(source_domain):
1658         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1659         return 1
1660     else:
1661         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1662         sources.update(source_domain)
1663
1664     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1665     fetched = network.get_json_api(
1666         source_domain,
1667         "/relays",
1668         {},
1669         (config.get("connection_timeout"), config.get("read_timeout"))
1670     )
1671     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1672
1673     if "error_message" in fetched:
1674         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1675         return 2
1676     elif "exception" in fetched:
1677         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1678         return 3
1679     elif "json" not in fetched:
1680         logger.warning("fetched has no element 'json' - EXIT!")
1681         return 4
1682
1683     domains = list()
1684
1685     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1686     for row in fetched["json"]:
1687         logger.debug("row[]='%s'", type(row))
1688         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1689         logger.debug("domain='%s' - AFTER!", domain)
1690
1691         if domain is None and domain == "":
1692             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1693             continue
1694
1695         logger.debug("domain='%s' - BEFORE!", domain)
1696         domain = domain.encode("idna").decode("utf-8")
1697         logger.debug("domain='%s' - AFTER!", domain)
1698
1699         if not domain_helper.is_wanted(domain):
1700             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1701             continue
1702         elif domain in domains:
1703             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1704             continue
1705         elif instances.is_registered(domain):
1706             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1707             continue
1708         elif instances.is_recent(domain):
1709             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1710             continue
1711
1712         logger.info("Fetching instances from domain='%s'", domain)
1713         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1714
1715     logger.debug("Success! - EXIT!")
1716     return 0
1717
1718 def fetch_relays(args: argparse.Namespace) -> int:
1719     logger.debug("args[]='%s' - CALLED!", type(args))
1720
1721     logger.debug("Invoking locking.acquire() ...")
1722     locking.acquire()
1723
1724     if args.domain is not None and args.domain != "":
1725         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1726         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1727     elif args.software is not None and args.software != "":
1728         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1729         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1730     else:
1731         logger.debug("Fetch all relay instances ...")
1732         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1733
1734     domains = list()
1735     rows = database.cursor.fetchall()
1736
1737     logger.info("Checking %d relays ...", len(rows))
1738     for row in rows:
1739         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1740         if not args.force and instances.is_recent(row["domain"]):
1741             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1742             continue
1743         elif row["nodeinfo_url"] is None:
1744             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1745             continue
1746
1747         peers = list()
1748         try:
1749             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1750             if row["software"] == "pub-relay":
1751                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1752                 raw = network.fetch_api_url(
1753                     row["nodeinfo_url"],
1754                     (config.get("connection_timeout"), config.get("read_timeout"))
1755                 )
1756
1757                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1758                 if "exception" in raw:
1759                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1760                     raise raw["exception"]
1761                 elif "error_message" in raw:
1762                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1763                     instances.set_last_error(row["domain"], raw)
1764                     instances.set_last_instance_fetch(row["domain"])
1765                     instances.update(row["domain"])
1766                     continue
1767                 elif "json" not in raw:
1768                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1769                     continue
1770                 elif not "metadata" in raw["json"]:
1771                     logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
1772                     continue
1773                 elif not "peers" in raw["json"]["metadata"]:
1774                     logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
1775                     continue
1776             else:
1777                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1778                 raw = network.fetch_url(
1779                     f"https://{row['domain']}",
1780                     network.web_headers,
1781                     (config.get("connection_timeout"), config.get("read_timeout"))
1782                 ).text
1783                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1784
1785                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1786                 logger.debug("doc[]='%s'", type(doc))
1787
1788         except network.exceptions as exception:
1789             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1790             instances.set_last_error(row["domain"], exception)
1791             instances.set_last_instance_fetch(row["domain"])
1792             instances.update(row["domain"])
1793             continue
1794
1795         logger.debug("row[software]='%s'", row["software"])
1796         if row["software"] == "activityrelay":
1797             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1798             tags = doc.findAll("p")
1799
1800             logger.debug("Checking %d paragraphs ...", len(tags))
1801             for tag in tags:
1802                 logger.debug("tag[]='%s'", type(tag))
1803                 if len(tag.contents) == 0:
1804                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1805                     continue
1806                 elif "registered instances" not in tag.contents[0]:
1807                     logger.debug("Skipping paragraph, text not found.")
1808                     continue
1809
1810                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1811                 for domain in tag.contents:
1812                     logger.debug("domain[%s]='%s'", type(domain), domain)
1813                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1814                         continue
1815
1816                     domain = str(domain)
1817                     logger.debug("domain='%s'", domain)
1818                     if not domain_helper.is_wanted(domain):
1819                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1820                         continue
1821
1822                     logger.debug("domain='%s' - BEFORE!", domain)
1823                     domain = tidyup.domain(domain) if domain not in[None, ""] else None
1824                     logger.debug("domain='%s' - AFTER!", domain)
1825
1826                     if domain in [None, ""]:
1827                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1828                         continue
1829                     elif domain not in peers:
1830                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1831                         peers.append(domain)
1832
1833                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1834                     if dict_helper.has_key(domains, "domain", domain):
1835                         logger.debug("domain='%s' already added", domain)
1836                         continue
1837
1838                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1839                     domains.append({
1840                         "domain": domain,
1841                         "origin": row["domain"],
1842                     })
1843         elif row["software"] in ["aoderelay", "selective-relay"]:
1844             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1845             if row["software"] == "aoderelay":
1846                 tags = doc.findAll("section", {"class": "instance"})
1847             else:
1848                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1849
1850             logger.debug("Checking %d tags ...", len(tags))
1851             for tag in tags:
1852                 logger.debug("tag[]='%s'", type(tag))
1853
1854                 link = tag.find("a")
1855                 logger.debug("link[%s]='%s'", type(link), link)
1856                 if not isinstance(link, bs4.element.Tag):
1857                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1858                     continue
1859
1860                 components = urlparse(link.get("href"))
1861                 logger.debug("components(%d)='%s'", len(components), components)
1862                 domain = components.netloc.lower().split(":")[0]
1863
1864                 logger.debug("domain='%s' - BEFORE!", domain)
1865                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1866                 logger.debug("domain='%s' - AFTER!", domain)
1867
1868                 if domain in [None, ""]:
1869                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1870                     continue
1871                 elif domain not in peers:
1872                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1873                     peers.append(domain)
1874
1875                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1876                 if dict_helper.has_key(domains, "domain", domain):
1877                     logger.debug("domain='%s' already added", domain)
1878                     continue
1879
1880                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1881                 domains.append({
1882                     "domain": domain,
1883                     "origin": row["domain"],
1884                 })
1885         elif row["software"] == "pub-relay":
1886             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1887             for domain in raw["json"]["metadata"]["peers"]:
1888                 logger.debug("domain='%s' - BEFORE!", domain)
1889                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1890                 logger.debug("domain='%s' - AFTER!", domain)
1891
1892                 if domain in [None, ""]:
1893                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1894                     continue
1895                 elif domain not in peers:
1896                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1897                     peers.append(domain)
1898
1899                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1900                 if dict_helper.has_key(domains, "domain", domain):
1901                     logger.debug("domain='%s' already added", domain)
1902                     continue
1903
1904                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1905                 domains.append({
1906                     "domain": domain,
1907                     "origin": row["domain"],
1908                 })
1909         else:
1910             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1911             continue
1912
1913         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1914         instances.set_last_instance_fetch(row["domain"])
1915
1916         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1917         instances.set_total_peers(row["domain"], peers)
1918
1919         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1920         instances.update(row["domain"])
1921
1922     logger.info("Checking %d domains ...", len(domains))
1923     for row in domains:
1924         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1925         if not domain_helper.is_wanted(row["domain"]):
1926             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1927             continue
1928         elif instances.is_registered(row["domain"]):
1929             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1930             continue
1931
1932         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1933         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1934
1935     logger.debug("Success! - EXIT!")
1936     return 0
1937
1938 def convert_idna(args: argparse.Namespace) -> int:
1939     logger.debug("args[]='%s' - CALLED!", type(args))
1940
1941     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1942     rows = database.cursor.fetchall()
1943
1944     logger.debug("rows[]='%s'", type(rows))
1945     instances.translate_idnas(rows, "domain")
1946
1947     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1948     rows = database.cursor.fetchall()
1949
1950     logger.debug("rows[]='%s'", type(rows))
1951     instances.translate_idnas(rows, "origin")
1952
1953     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1954     rows = database.cursor.fetchall()
1955
1956     logger.debug("rows[]='%s'", type(rows))
1957     blocks.translate_idnas(rows, "blocker")
1958
1959     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1960     rows = database.cursor.fetchall()
1961
1962     logger.debug("rows[]='%s'", type(rows))
1963     blocks.translate_idnas(rows, "blocked")
1964
1965     logger.debug("Success! - EXIT!")
1966     return 0
1967
1968 def remove_invalid(args: argparse.Namespace) -> int:
1969     logger.debug("args[]='%s' - CALLED!", type(args))
1970
1971     logger.debug("Invoking locking.acquire() ...")
1972     locking.acquire()
1973
1974     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1975     rows = database.cursor.fetchall()
1976
1977     logger.info("Checking %d domains ...", len(rows))
1978     for row in rows:
1979         logger.debug("row[domain]='%s'", row["domain"])
1980         if not validators.domain(row["domain"].split("/")[0]):
1981             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1982             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1983             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1984
1985     logger.debug("Invoking commit() ...")
1986     database.connection.commit()
1987
1988     logger.info("Vaccum cleaning database ...")
1989     database.cursor.execute("VACUUM")
1990
1991     logger.debug("Success! - EXIT!")
1992     return 0