fba/commands.py
# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
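    """Check whether args.domain is valid, not blacklisted and not yet registered.
    Returns 0 when the domain is unknown, otherwise a non-zero status code."""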
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
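    """Cross-check that each instance's stored nodeinfo_url actually points at its
    own domain (or its punycode form) and report mismatches."""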
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt += 1

    logger.info("Found %d mismatching row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
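    """Fetch the list of known Pixelfed servers from the pixelfed.org API and
    register any new, wanted instances."""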
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers yourself here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch Pixelfed API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
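    """Fetch the domain list from the gql.api.bka.li GraphQL API and register any
    new, wanted instances."""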
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch GraphQL, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
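    """Fetch block lists from registered instances - either a single domain, all
    instances of a given software, only entries without known totals, or all
    recheck-due instances - and store the discovered blocks."""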
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=None
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some servers obscure domains with question marks instead; possibly version-dependent
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Instance wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
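    """Fetch instances per software type from fediverse.observer, optionally
    limited to args.software, and register any new, wanted instances."""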
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif "data" not in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif "nodes" not in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))

        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if "domain" not in item:
                logger.debug("item()=%d has no element 'domain' - SKIPPED!", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
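    """Fetch the silenced/suspended server lists from wiki.todon.eu and store them
    as blocks of todon.eu."""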
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
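    """Fetch chaos.social's federation.md from raw.githubusercontent.com, parse the
    silenced/blocked tables and store them as blocks of chaos.social."""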
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
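    """Fetch an FBA-specific RSS feed (args.feed), extract domains from the item
    links and register any new, wanted instances."""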
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
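    """Fetch the Atom feed of the FBA bot account (default: ryona.agency), extract
    domains from linked posts and register any new, wanted instances."""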
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
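    """Fetch the peer list of args.domain and register newly discovered instances.
    Unless args.single is set, also re-crawl known instances that are due for a
    recheck."""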
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    logger.debug("software='%s'", software)
    if software is None:
        logger.warning("args.domain='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated.", args.domain, args.domain)
        return 102
    elif software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 103

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 104

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
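    """Process all configured CSV blocklists (blocklists.csv_files), optionally
    limited to a single blocker via args.domain."""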
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given that doesn't match this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
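    """Download the configured Oliphant blocklists from codeberg.org and process
    each CSV, optionally limited to a single blocker via args.domain."""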
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given that doesn't match this blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
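    """Fetch plain-text blocklists (blocklists.txt_files) and register each listed
    domain."""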
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None
                logger.debug("domain='%s' - AFTER!", domain)

                if domain in [None, ""]:
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif not args.force and instances.is_registered(domain):
                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
                logger.debug("processed='%s'", processed)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
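    """Scrape the signatory list from fedipact.online and register any new, wanted
    instances."""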
1122     logger.debug("args[]='%s' - CALLED!", type(args))
1123
1124     logger.debug("Invoking locking.acquire() ...")
1125     locking.acquire()
1126
1127     source_domain = "fedipact.online"
1128     if sources.is_recent(source_domain):
1129         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1130         return 1
1131     else:
1132         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1133         sources.update(source_domain)
1134
1135     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1136     response = utils.fetch_url(
1137         f"https://{source_domain}",
1138         network.web_headers,
1139         (config.get("connection_timeout"), config.get("read_timeout"))
1140     )
1141
1142     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1143     if response.ok and response.status_code == 200 and response.text != "":
1144         logger.debug("Parsing %d Bytes ...", len(response.text))
1145
1146         doc = bs4.BeautifulSoup(response.text, "html.parser")
1147         logger.debug("doc[]='%s'", type(doc))
1148
1149         rows = doc.findAll("li")
1150         logger.info("Checking %d row(s) ...", len(rows))
1151         for row in rows:
1152             logger.debug("row[]='%s'", type(row))
1153             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1154
1155             logger.debug("domain='%s' - AFTER!", domain)
1156             if domain in [None, ""]:
1157                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1158                 continue
1159
1160             logger.debug("domain='%s' - BEFORE!", domain)
1161             domain = domain.encode("idna").decode("utf-8")
1162             logger.debug("domain='%s' - AFTER!", domain)
1163
1164             if not domain_helper.is_wanted(domain):
1165                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1166                 continue
1167             elif instances.is_registered(domain):
1168                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1169                 continue
1170             elif instances.is_recent(domain):
1171                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1172                 continue
1173
1174             logger.info("Fetching domain='%s' ...", domain)
1175             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1176
1177     logger.debug("Success! - EXIT!")
1178     return 0
1179
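# Fetches the Mobilizon instance list ('data' array) from
# instances.joinmobilizon.org and queues any unknown, wanted host for fetching.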
1180 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1181     logger.debug("args[]='%s' - CALLED!", type(args))
1182
1183     logger.debug("Invoking locking.acquire() ...")
1184     locking.acquire()
1185
1186     source_domain = "instances.joinmobilizon.org"
1187     if sources.is_recent(source_domain):
1188         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1189         return 1
1190     else:
1191         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1192         sources.update(source_domain)
1193
1194     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1195     raw = utils.fetch_url(
1196         f"https://{source_domain}/api/v1/instances",
1197         network.web_headers,
1198         (config.get("connection_timeout"), config.get("read_timeout"))
1199     ).text
1200     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1201
1202     parsed = json.loads(raw)
1203     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1204
1205     if "data" not in parsed:
1206         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1207         return 1
1208
1209     logger.info("Checking %d instances ...", len(parsed["data"]))
1210     for row in parsed["data"]:
1211         logger.debug("row[]='%s'", type(row))
1212         if "host" not in row:
1213             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1214             continue
1215         elif not domain_helper.is_wanted(row["host"]):
1216             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1217             continue
1218         elif instances.is_registered(row["host"]):
1219             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1220             continue
1221
1222         logger.info("Fetching row[host]='%s' ...", row["host"])
1223         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1224
1225     logger.debug("Success! - EXIT!")
1226     return 0
1227
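# Fetches instances.json ('instancesInfos' array) from instanceapp.misskey.page
# and queues any unknown, wanted Misskey instance for fetching.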
1228 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1229     logger.debug("args[]='%s' - CALLED!", type(args))
1230
1231     logger.debug("Invoking locking.acquire() ...")
1232     locking.acquire()
1233
1234     source_domain = "instanceapp.misskey.page"
1235     if sources.is_recent(source_domain):
1236         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1237         return 1
1238     else:
1239         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1240         sources.update(source_domain)
1241
1242     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1243     raw = utils.fetch_url(
1244         f"https://{source_domain}/instances.json",
1245         network.web_headers,
1246         (config.get("connection_timeout"), config.get("read_timeout"))
1247     ).text
1248     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1249
1250     parsed = json.loads(raw)
1251     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1252
1253     if "instancesInfos" not in parsed:
1254         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1255         return 1
1256
1257     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1258     for row in parsed["instancesInfos"]:
1259         logger.debug("row[%s]='%s'", type(row), row)
1260         if "url" not in row:
1261             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1262             continue
1263         elif not domain_helper.is_wanted(row["url"]):
1264             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1265             continue
1266         elif instances.is_registered(row["url"]):
1267             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1268             continue
1269
1270         logger.info("Fetching row[url]='%s' ...", row["url"])
1271         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1272
1273     logger.debug("Success! - EXIT!")
1274     return 0
1275
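# Re-fetches block lists from instances flagged with (possibly) obfuscated
# entries, tries to deobfuscate each wildcard entry and updates the instance's
# obfuscation counters.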
1276 def recheck_obfuscation(args: argparse.Namespace) -> int:
1277     logger.debug("args[]='%s' - CALLED!", type(args))
1278
1279     logger.debug("Invoking locking.acquire() ...")
1280     locking.acquire()
1281
1282     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1283         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1284     elif isinstance(args.software, str) and args.software != "":
1285         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1286     else:
1287         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1288
1289     rows = database.cursor.fetchall()
1290     logger.info("Checking %d domains ...", len(rows))
1291     for row in rows:
1292         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1293         if blacklist.is_blacklisted(row["domain"]):
1294             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1295             continue
1296         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1297             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1298             continue
1299
1300         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1301         blocking = federation.fetch_blocks(row["domain"])
1302
1303         logger.debug("blocking()=%d", len(blocking))
1304         if len(blocking) == 0:
1305             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1306             if row["software"] == "pleroma":
1307                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1308                 blocking = pleroma.fetch_blocks(row["domain"])
1309             elif row["software"] == "mastodon":
1310                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1311                 blocking = mastodon.fetch_blocks(row["domain"])
1312             elif row["software"] == "lemmy":
1313                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1314                 blocking = lemmy.fetch_blocks(row["domain"])
1315             elif row["software"] == "friendica":
1316                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1317                 blocking = friendica.fetch_blocks(row["domain"])
1318             elif row["software"] == "misskey":
1319                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1320                 blocking = misskey.fetch_blocks(row["domain"])
1321             else:
1322                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1323
1324         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1325         logger.debug("row[domain]='%s'", row["domain"])
1326         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1327             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1328             instances.set_last_blocked(row["domain"])
1329             instances.set_total_blocks(row["domain"], blocking)
1330
1331         obfuscated = 0
1332         blockdict = list()
1333
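        # Walk the block list: skip unusable entries and try to deobfuscate
        # wildcard entries (containing '*' or '?'), counting those that remain
        # obfuscated.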
1334         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1335         for block in blocking:
1336             logger.debug("block[blocked]='%s'", block["blocked"])
1337             blocked = None
1338
1339             if block["blocked"] == "":
1340                 logger.debug("block[blocked] is empty - SKIPPED!")
1341                 continue
1342             elif block["blocked"].endswith(".onion"):
1343                 logger.debug("blocked='%s' is a Tor onion domain name - SKIPPED!", block["blocked"])
1344                 continue
1345             elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
1346                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1347                 continue
1348             elif block["blocked"].endswith(".arpa"):
1349                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1350                 continue
1351             elif block["blocked"].endswith(".tld"):
1352                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1353                 continue
1354             elif "*" in block["blocked"] or "?" in block["blocked"]:
1355                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1356                 obfuscated = obfuscated + 1
1357                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block.get("digest"))
1358             elif not domain_helper.is_wanted(block["blocked"]):
1359                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1360                 continue
1361             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1362                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1363                 continue
1364
1365             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1366             if blocked is not None and blocked != block["blocked"]:
1367                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1368                 obfuscated = obfuscated - 1
1369
1370                 if blacklist.is_blacklisted(blocked):
1371                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1372                     continue
1373                 elif blacklist.is_blacklisted(row["domain"]):
1374                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1375                     continue
1376                 elif blocks.is_instance_blocked(row["domain"], blocked):
1377                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1378                     continue
1379
1380                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1381
1382                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1383                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1384                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1385                     blockdict.append({
1386                         "blocked": blocked,
1387                         "reason" : block["reason"],
1388                     })
1389
1390         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1391         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1392         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1393
1394         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1395         if instances.has_pending(row["domain"]):
1396             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1397             instances.update(row["domain"])
1398
1399         logger.debug("Invoking commit() ...")
1400         database.connection.commit()
1401
1402         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1403         if config.get("bot_enabled") and len(blockdict) > 0:
1404             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1405             network.send_bot_post(row["domain"], blockdict)
1406
1407     logger.debug("Success! - EXIT!")
1408     return 0
1409
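# Downloads a CSV of instances from demo.fedilist.com (optionally filtered by
# software) and queues every new, wanted hostname for fetching.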
1410 def fetch_fedilist(args: argparse.Namespace) -> int:
1411     logger.debug("args[]='%s' - CALLED!", type(args))
1412
1413     logger.debug("Invoking locking.acquire() ...")
1414     locking.acquire()
1415
1416     source_domain = "demo.fedilist.com"
1417     if sources.is_recent(source_domain):
1418         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1419         return 1
1420     else:
1421         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1422         sources.update(source_domain)
1423
1424     url = f"http://{source_domain}/instance/csv?onion=not"
1425     if args.software is not None and args.software != "":
1426         logger.debug("args.software='%s'", args.software)
1427         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1428
1429     logger.info("Fetching url='%s' ...", url)
1430     response = reqto.get(
1431         url,
1432         headers=network.web_headers,
1433         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1434         allow_redirects=False
1435     )
1436
1437     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1438     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1439         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1440         return 1
1441
1442     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1443
1444     logger.debug("reader[]='%s'", type(reader))
1445     if reader is None:
1446         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1447         return 2
1448
1449     rows = list(reader)
1450
1451     logger.info("Checking %d rows ...", len(rows))
1452     for row in rows:
1453         logger.debug("row[]='%s'", type(row))
1454         if "hostname" not in row:
1455             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1456             continue
1457
1458         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1459         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1460         logger.debug("domain='%s' - AFTER!", domain)
1461
1462         if domain in [None, ""]:
1463             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1464             continue
1465
1466         logger.debug("domain='%s' - BEFORE!", domain)
1467         domain = domain.encode("idna").decode("utf-8")
1468         logger.debug("domain='%s' - AFTER!", domain)
1469
1470         if not domain_helper.is_wanted(domain):
1471             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1472             continue
1473         elif (args.force is None or not args.force) and instances.is_registered(domain):
1474             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1475             continue
1476         elif instances.is_recent(domain):
1477             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1478             continue
1479
1480         logger.info("Fetching instances from domain='%s' ...", domain)
1481         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1482
1483     logger.debug("Success! - EXIT!")
1484     return 0
1485
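# Re-runs software detection (nodeinfo) for the instances selected by the given
# command-line arguments and updates each record accordingly.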
1486 def update_nodeinfo(args: argparse.Namespace) -> int:
1487     logger.debug("args[]='%s' - CALLED!", type(args))
1488
1489     logger.debug("Invoking locking.acquire() ...")
1490     locking.acquire()
1491
1492     if args.domain is not None and args.domain != "":
1493         logger.debug("Fetching args.domain='%s'", args.domain)
1494         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1495     elif args.software is not None and args.software != "":
1496         logger.info("Fetching domains for args.software='%s'", args.software)
1497         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1498     elif args.mode is not None and args.mode != "":
1499         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1500         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1501     elif args.no_software:
1502         logger.info("Fetching domains with no software type detected ...")
1503         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1504     elif args.with_software:
1505         logger.info("Fetching domains with any software type detected ...")
1506         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1507     elif args.no_auto:
1508         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1509         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1510     elif args.no_detection:
1511         logger.info("Fetching domains with no detection mode being set ...")
1512         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1513     else:
1514         logger.info("Fetching domains for recently updated ...")
1515         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1516
1517     domains = database.cursor.fetchall()
1518
1519     logger.info("Checking %d domain(s) ...", len(domains))
1520     cnt = 0
1521     for row in domains:
1522         logger.debug("row[]='%s'", type(row))
1523         if blacklist.is_blacklisted(row["domain"]):
1524             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1525             continue
1526         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1527             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1528             continue
1529
1530         try:
1531             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1532             software = federation.determine_software(row["domain"])
1533
1534             logger.debug("Determined software='%s'", software)
1535             if (software != row["software"] and software is not None) or args.force is True:
1536                 logger.debug("software='%s'", software)
1537                 if software is None:
1538                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1539                     instances.set_nodeinfo_url(row["domain"], None)
1540
1541                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1542                 instances.set_software(row["domain"], software)
1543
1544             if software is not None:
1545                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1546                 instances.set_success(row["domain"])
1547         except network.exceptions as exception:
1548             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1549             instances.set_last_error(row["domain"], exception)
1550
1551         instances.set_last_nodeinfo(row["domain"])
1552         instances.update(row["domain"])
1553         cnt = cnt + 1
1554
1555     logger.debug("Success! - EXIT!")
1556     return 0
1557
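# Queries the instances.social list API (requires 'instances_social_api_key' in
# config.json) and queues any unknown, wanted domain for fetching.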
1558 def fetch_instances_social(args: argparse.Namespace) -> int:
1559     logger.debug("args[]='%s' - CALLED!", type(args))
1560
1561     logger.debug("Invoking locking.acquire() ...")
1562     locking.acquire()
1563
1564     source_domain = "instances.social"
1565
1566     if config.get("instances_social_api_key") == "":
1567         logger.error("API key not set. Please set in your config.json file.")
1568         return 1
1569     elif sources.is_recent(source_domain):
1570         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1571         return 2
1572     else:
1573         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1574         sources.update(source_domain)
1575
1576     headers = {
1577         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1578     }
1579
1580     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1581     fetched = network.get_json_api(
1582         source_domain,
1583         "/api/1.0/instances/list?count=0&sort_by=name",
1584         headers=headers,
1585         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1586     )
1587     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1588
1589     if "error_message" in fetched:
1590         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1591         return 2
1592     elif "exception" in fetched:
1593         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1594         return 3
1595     elif "json" not in fetched:
1596         logger.warning("fetched has no element 'json' - EXIT!")
1597         return 4
1598     elif "instances" not in fetched["json"]:
1599         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1600         return 5
1601
1602     domains = list()
1603     rows = fetched["json"]["instances"]
1604
1605     logger.info("Checking %d row(s) ...", len(rows))
1606     for row in rows:
1607         logger.debug("row[]='%s'", type(row))
1608         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1609         logger.debug("domain='%s' - AFTER!", domain)
1610
1611         if domain in [None, ""]:
1612             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1613             continue
1614
1615         logger.debug("domain='%s' - BEFORE!", domain)
1616         domain = domain.encode("idna").decode("utf-8")
1617         logger.debug("domain='%s' - AFTER!", domain)
1618
1619         if not domain_helper.is_wanted(domain):
1620             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1621             continue
1622         elif domain in domains:
1623             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1624             continue
1625         elif instances.is_registered(domain):
1626             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1627             continue
1628         elif instances.is_recent(domain):
1629             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1630             continue
1631
1632         logger.info("Fetching instances from domain='%s' ...", domain)
1633         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1634
1635     logger.debug("Success! - EXIT!")
1636     return 0
1637
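# Fetches the relay list from api.relaylist.com and queues any unknown, wanted
# relay domain for fetching.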
1638 def fetch_relaylist(args: argparse.Namespace) -> int:
1639     logger.debug("args[]='%s' - CALLED!", type(args))
1640
1641     logger.debug("Invoking locking.acquire() ...")
1642     locking.acquire()
1643
1644     source_domain = "api.relaylist.com"
1645
1646     if sources.is_recent(source_domain):
1647         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1648         return 1
1649     else:
1650         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1651         sources.update(source_domain)
1652
1653     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1654     fetched = network.get_json_api(
1655         source_domain,
1656         "/relays",
1657         {},
1658         (config.get("connection_timeout"), config.get("read_timeout"))
1659     )
1660     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1661
1662     if "error_message" in fetched:
1663         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1664         return 2
1665     elif "exception" in fetched:
1666         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1667         return 3
1668     elif "json" not in fetched:
1669         logger.warning("fetched has no element 'json' - EXIT!")
1670         return 4
1671
1672     domains = list()
1673
1674     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1675     for row in fetched["json"]:
1676         logger.debug("row[]='%s'", type(row))
1677         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1678         logger.debug("domain='%s' - AFTER!", domain)
1679
1680         if domain in [None, ""]:
1681             logger.debug("domain='%s' is empty - SKIPPED!", domain)
1682             continue
1683
1684         logger.debug("domain='%s' - BEFORE!", domain)
1685         domain = domain.encode("idna").decode("utf-8")
1686         logger.debug("domain='%s' - AFTER!", domain)
1687
1688         if not domain_helper.is_wanted(domain):
1689             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1690             continue
1691         elif domain in domains:
1692             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1693             continue
1694         elif instances.is_registered(domain):
1695             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1696             continue
1697         elif instances.is_recent(domain):
1698             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1699             continue
1700
1701         logger.info("Fetching instances from domain='%s'", domain)
1702         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1703
1704     logger.debug("Success! - EXIT!")
1705     return 0
1706
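# Determines peers of known relay instances: pub-relay is queried through its
# nodeinfo URL, other supported relay software is scraped from its HTML front
# page. Newly found peers are then fetched.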
1707 def fetch_relays(args: argparse.Namespace) -> int:
1708     logger.debug("args[]='%s' - CALLED!", type(args))
1709
1710     logger.debug("Invoking locking.acquire() ...")
1711     locking.acquire()
1712
1713     if args.domain is not None and args.domain != "":
1714         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1715         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1716     elif args.software is not None and args.software != "":
1717         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1718         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1719     else:
1720         logger.debug("Fetch all relay instances ...")
1721         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1722
1723     domains = list()
1724     rows = database.cursor.fetchall()
1725
1726     logger.info("Checking %d relays ...", len(rows))
1727     for row in rows:
1728         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1729         if not args.force and instances.is_recent(row["domain"]):
1730             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1731             continue
1732         elif row["nodeinfo_url"] is None:
1733             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1734             continue
1735
1736         peers = list()
1737         try:
1738             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1739             if row["software"] == "pub-relay":
1740                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1741                 raw = network.fetch_api_url(
1742                     row["nodeinfo_url"],
1743                     (config.get("connection_timeout"), config.get("read_timeout"))
1744                 )
1745
1746                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1747                 if "exception" in raw:
1748                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1749                     raise raw["exception"]
1750                 elif "error_message" in raw:
1751                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1752                     instances.set_last_error(row["domain"], raw)
1753                     instances.set_last_instance_fetch(row["domain"])
1754                     instances.update(row["domain"])
1755                     continue
1756                 elif "json" not in raw:
1757                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1758                     continue
1759                 elif "metadata" not in raw["json"]:
1760                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1761                     continue
1762                 elif "peers" not in raw["json"]["metadata"]:
1763                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1764                     continue
1765             else:
1766                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1767                 raw = utils.fetch_url(
1768                     f"https://{row['domain']}",
1769                     network.web_headers,
1770                     (config.get("connection_timeout"), config.get("read_timeout"))
1771                 ).text
1772                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1773
1774                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1775                 logger.debug("doc[]='%s'", type(doc))
1776
1777         except network.exceptions as exception:
1778             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1779             instances.set_last_error(row["domain"], exception)
1780             instances.set_last_instance_fetch(row["domain"])
1781             instances.update(row["domain"])
1782             continue
1783
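        # Extract peer domains; the parsing strategy depends on the relay
        # software's page or API layout.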
1784         logger.debug("row[software]='%s'", row["software"])
1785         if row["software"] == "activityrelay":
1786             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1787             tags = doc.findAll("p")
1788
1789             logger.debug("Checking %d paragraphs ...", len(tags))
1790             for tag in tags:
1791                 logger.debug("tag[]='%s'", type(tag))
1792                 if len(tag.contents) == 0:
1793                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1794                     continue
1795                 elif "registered instances" not in tag.contents[0]:
1796                     logger.debug("Skipping paragraph, text not found.")
1797                     continue
1798
1799                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1800                 for domain in tag.contents:
1801                     logger.debug("domain[%s]='%s'", type(domain), domain)
1802                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1803                         continue
1804
1805                     domain = str(domain)
1806                     logger.debug("domain='%s'", domain)
1807                     if not domain_helper.is_wanted(domain):
1808                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1809                         continue
1810
1811                     logger.debug("domain='%s' - BEFORE!", domain)
1812                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1813                     logger.debug("domain='%s' - AFTER!", domain)
1814
1815                     if domain in [None, ""]:
1816                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1817                         continue
1818                     elif domain not in peers:
1819                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1820                         peers.append(domain)
1821
1822                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1823                     if dict_helper.has_key(domains, "domain", domain):
1824                         logger.debug("domain='%s' already added", domain)
1825                         continue
1826
1827                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1828                     domains.append({
1829                         "domain": domain,
1830                         "origin": row["domain"],
1831                     })
1832         elif row["software"] in ["aoderelay", "selective-relay"]:
1833             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1834             if row["software"] == "aoderelay":
1835                 tags = doc.findAll("section", {"class": "instance"})
1836             else:
1837                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1838
1839             logger.debug("Checking %d tags ...", len(tags))
1840             for tag in tags:
1841                 logger.debug("tag[]='%s'", type(tag))
1842
1843                 link = tag.find("a")
1844                 logger.debug("link[%s]='%s'", type(link), link)
1845                 if not isinstance(link, bs4.element.Tag):
1846                     logger.warning("link[%s]='%s' is not of type 'bs4.element.Tag' - SKIPPED!", type(link), link)
1847                     continue
1848
1849                 components = urlparse(link.get("href"))
1850                 logger.debug("components(%d)='%s'", len(components), components)
1851                 domain = components.netloc.lower().split(":")[0]
1852
1853                 logger.debug("domain='%s' - BEFORE!", domain)
1854                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1855                 logger.debug("domain='%s' - AFTER!", domain)
1856
1857                 if domain in [None, ""]:
1858                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1859                     continue
1860                 elif domain not in peers:
1861                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1862                     peers.append(domain)
1863
1864                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1865                 if dict_helper.has_key(domains, "domain", domain):
1866                     logger.debug("domain='%s' already added", domain)
1867                     continue
1868
1869                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1870                 domains.append({
1871                     "domain": domain,
1872                     "origin": row["domain"],
1873                 })
1874         elif row["software"] == "pub-relay":
1875             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1876             for domain in raw["json"]["metadata"]["peers"]:
1877                 logger.debug("domain='%s' - BEFORE!", domain)
1878                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1879                 logger.debug("domain='%s' - AFTER!", domain)
1880
1881                 if domain in [None, ""]:
1882                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1883                     continue
1884                 elif domain not in peers:
1885                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1886                     peers.append(domain)
1887
1888                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1889                 if dict_helper.has_key(domains, "domain", domain):
1890                     logger.debug("domain='%s' already added", domain)
1891                     continue
1892
1893                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1894                 domains.append({
1895                     "domain": domain,
1896                     "origin": row["domain"],
1897                 })
1898         else:
1899             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1900             continue
1901
1902         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1903         instances.set_last_instance_fetch(row["domain"])
1904
1905         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1906         instances.set_total_peers(row["domain"], peers)
1907
1908         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1909         instances.update(row["domain"])
1910
1911     logger.info("Checking %d domains ...", len(domains))
1912     for row in domains:
1913         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1914         if not domain_helper.is_wanted(row["domain"]):
1915             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1916             continue
1917         elif instances.is_registered(row["domain"]):
1918             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1919             continue
1920
1921         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1922         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1923
1924     logger.debug("Success! - EXIT!")
1925     return 0
1926
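# Converts all non-punycode domain, origin, blocker and blocked values in the
# instances and blocks tables to their IDNA (punycode) representation.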
1927 def convert_idna(args: argparse.Namespace) -> int:
1928     logger.debug("args[]='%s' - CALLED!", type(args))
1929
1930     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1931     rows = database.cursor.fetchall()
1932
1933     logger.debug("rows[]='%s'", type(rows))
1934     instances.translate_idnas(rows, "domain")
1935
1936     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1937     rows = database.cursor.fetchall()
1938
1939     logger.debug("rows[]='%s'", type(rows))
1940     instances.translate_idnas(rows, "origin")
1941
1942     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1943     rows = database.cursor.fetchall()
1944
1945     logger.debug("rows[]='%s'", type(rows))
1946     blocks.translate_idnas(rows, "blocker")
1947
1948     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1949     rows = database.cursor.fetchall()
1950
1951     logger.debug("rows[]='%s'", type(rows))
1952     blocks.translate_idnas(rows, "blocked")
1953
1954     logger.debug("Success! - EXIT!")
1955     return 0
1956
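# Deletes instances (and their block records) whose domain name fails
# validation, then vacuums the database.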
1957 def remove_invalid(args: argparse.Namespace) -> int:
1958     logger.debug("args[]='%s' - CALLED!", type(args))
1959
1960     logger.debug("Invoking locking.acquire() ...")
1961     locking.acquire()
1962
1963     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1964     rows = database.cursor.fetchall()
1965
1966     logger.info("Checking %d domains ...", len(rows))
1967     for row in rows:
1968         logger.debug("row[domain]='%s'", row["domain"])
1969         if not validators.domain(row["domain"].split("/")[0]):
1970             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1971             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1972             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1973
1974     logger.debug("Invoking commit() ...")
1975     database.connection.commit()
1976
1977     logger.info("Vacuum cleaning database ...")
1978     database.cursor.execute("VACUUM")
1979
1980     logger.debug("Success! - EXIT!")
1981     return 0