# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
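    """Checks whether args.domain is a syntactically valid, not blacklisted
    and not yet registered domain name. Returns 0 when the domain is unknown,
    100 when invalid, 101 when blacklisted and 102 when already registered."""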
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
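    """Cross-checks each registered instance's nodeinfo_url against its
    domain (including the punycode form) and logs entries whose nodeinfo URL
    points at a different host. Always returns 0."""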
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt += 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
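    """Fetches the public server list from the pixelfed.org API and fetches
    instance data for every new, wanted domain. Returns 0 on success, 1 when
    the source was queried too recently and a code >= 100 on errors."""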
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is used by default, so network.source_headers doesn't have to be added here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
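    """Queries the GraphQL endpoint at gql.api.bka.li for its domain list and
    fetches instance data for every new, wanted domain. Returns 0 on success,
    1 when the source was queried too recently and a code >= 100 on errors."""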
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
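    """Fetches and stores blocklists, either for a single domain or software
    type (args.domain / args.software), only for instances without a known
    block count (args.only_none) or for all supported instances. Obfuscated
    entries are deobfuscated where possible. Returns 0 on success, >= 100 on
    invalid arguments."""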
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=None
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some instances obscure domains with question marks instead; this may depend on the software version
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Instance isn't registered yet, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
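    """Determines the list of software types from fediverse.observer (or uses
    args.software), queries the domain list per software type from its API
    and fetches instance data for new, wanted domains. Returns 0 on success,
    1 when the source was queried too recently."""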
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = network.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif "data" not in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif "nodes" not in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))

        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if "domain" not in item:
                logger.debug("item()=%d has no element 'domain'", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
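    """Scrapes the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks by todon.eu. Returns 0 on
    success, 1 when the source was queried too recently."""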
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

    # Commit and notify once after all block levels have been processed
    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
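    """Fetches chaos.social's federation.md from raw.githubusercontent.com,
    parses the silenced and blocked instance tables and records them as
    blocks by chaos.social. Returns 0 on success, 1 when the source was
    queried too recently."""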
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
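    """Fetches the FBA-specific RSS feed given via args.feed, extracts
    domains from the item links and fetches instance data for new, wanted
    domains. Returns 0 on success or when the feed's domain was queried too
    recently, 100 on fetch errors."""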
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
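    """Fetches the Atom feed of the FBA bot account (ryona.agency unless
    overridden via args.feed), extracts linked domains and fetches instance
    data for new, wanted domains. Returns 0 on success, 1 when the source
    was queried too recently, 100 on fetch errors."""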
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
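    """Fetches peer instances, either starting from a single args.domain
    (only that one when args.single is set) or for all supported instances
    whose last fetch is older than the configured recheck interval. Returns
    0 on success, 100/101 when args.domain is invalid or blacklisted."""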
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Is a single domain set?
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101

        logger.debug("args.domain='%s' - BEFORE!", args.domain)
        domain = tidyup.domain(args.domain)
        logger.debug("domain='%s' - AFTER!", domain)

        # Fetch record
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
        if row["software"] is None:
            logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
            continue
        elif software_helper.is_relay(row["software"]):
            logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
            continue

        # Initial fetch
        try:
            logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)
            instances.update(row["domain"])
            raise exception

        if args.single:
            logger.debug("Not fetching more instances - BREAK!")
            break

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software \
FROM instances \
WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') \
AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) \
ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
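    """Processes all CSV-based blocklists configured in blocklists.csv_files,
    optionally restricted to a single blocker via args.domain. Returns 0."""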
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given that doesn't match this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
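    """Downloads oliphant's blocklist CSV files from codeberg.org and
    processes them, optionally restricted to a single blocker via
    args.domain. Returns 0 on success, 1 when the source was queried too
    recently."""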
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given that doesn't match this blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
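    """Downloads the plain-text blocklists configured in blocklists.txt_files
    and processes every listed domain for the respective blocker. Returns
    0."""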
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None
                logger.debug("domain='%s' - AFTER!", domain)

                if domain in [None, ""]:
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif not args.force and instances.is_registered(domain):
                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
                logger.debug("processed='%s'", processed)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
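    """Scrapes the instance list from fedipact.online's landing page and
    fetches instance data for new, wanted domains. Returns 0 on success, 1
    when the source was queried too recently."""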
1127     logger.debug("args[]='%s' - CALLED!", type(args))
1128
1129     logger.debug("Invoking locking.acquire() ...")
1130     locking.acquire()
1131
1132     source_domain = "fedipact.online"
1133     if sources.is_recent(source_domain):
1134         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1135         return 1
1136     else:
1137         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1138         sources.update(source_domain)
1139
1140     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1141     response = network.fetch_url(
1142         f"https://{source_domain}",
1143         network.web_headers,
1144         (config.get("connection_timeout"), config.get("read_timeout"))
1145     )
1146
1147     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1148     if response.ok and response.status_code == 200 and response.text != "":
1149         logger.debug("Parsing %d Bytes ...", len(response.text))
1150
1151         doc = bs4.BeautifulSoup(response.text, "html.parser")
1152         logger.debug("doc[]='%s'", type(doc))
1153
1154         rows = doc.findAll("li")
1155         logger.info("Checking %d row(s) ...", len(rows))
1156         for row in rows:
1157             logger.debug("row[]='%s'", type(row))
1158             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1159
1160             logger.debug("domain='%s' - AFTER!", domain)
1161             if domain in [None, ""]:
1162                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1163                 continue
1164
1165             logger.debug("domain='%s' - BEFORE!", domain)
1166             domain = domain.encode("idna").decode("utf-8")
1167             logger.debug("domain='%s' - AFTER!", domain)
1168
1169             if not domain_helper.is_wanted(domain):
1170                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1171                 continue
1172             elif instances.is_registered(domain):
1173                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1174                 continue
1175             elif instances.is_recent(domain):
1176                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1177                 continue
1178
1179             logger.info("Fetching domain='%s' ...", domain)
1180             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1181
1182     logger.debug("Success! - EXIT!")
1183     return 0
1184
1185 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
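    """
    Queries instances.joinmobilizon.org/api/v1/instances and fetches instance
    data for every new, wanted 'host' entry. Returns 0 on success, 1 if the
    source was queried too recently or the response lacks the 'data' key.
    """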
1186     logger.debug("args[]='%s' - CALLED!", type(args))
1187
1188     logger.debug("Invoking locking.acquire() ...")
1189     locking.acquire()
1190
1191     source_domain = "instances.joinmobilizon.org"
1192     if sources.is_recent(source_domain):
1193         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1194         return 1
1195     else:
1196         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1197         sources.update(source_domain)
1198
1199     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1200     raw = network.fetch_url(
1201         f"https://{source_domain}/api/v1/instances",
1202         network.web_headers,
1203         (config.get("connection_timeout"), config.get("read_timeout"))
1204     ).text
1205     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1206
1207     parsed = json.loads(raw)
1208     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1209
1210     if "data" not in parsed:
1211         logger.warning("parsed()=%d does not contain key 'data'")
1212         return 1
1213
1214     logger.info("Checking %d instances ...", len(parsed["data"]))
1215     for row in parsed["data"]:
1216         logger.debug("row[]='%s'", type(row))
1217         if "host" not in row:
1218             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1219             continue
1220         elif not domain_helper.is_wanted(row["host"]):
1221             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1222             continue
1223         elif instances.is_registered(row["host"]):
1224             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1225             continue
1226
1227         logger.info("Fetching row[host]='%s' ...", row["host"])
1228         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1229
1230     logger.debug("Success! - EXIT!")
1231     return 0
1232
1233 def fetch_joinmisskey(args: argparse.Namespace) -> int:
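    """
    Downloads instances.json from instanceapp.misskey.page and fetches instance
    data for every new, wanted 'url' entry in 'instancesInfos'. Returns 0 on
    success, 1 on a too-recent query or a malformed response.
    """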
1234     logger.debug("args[]='%s' - CALLED!", type(args))
1235
1236     logger.debug("Invoking locking.acquire() ...")
1237     locking.acquire()
1238
1239     source_domain = "instanceapp.misskey.page"
1240     if sources.is_recent(source_domain):
1241         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1242         return 1
1243     else:
1244         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1245         sources.update(source_domain)
1246
1247     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1248     raw = network.fetch_url(
1249         f"https://{source_domain}/instances.json",
1250         network.web_headers,
1251         (config.get("connection_timeout"), config.get("read_timeout"))
1252     ).text
1253     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1254
1255     parsed = json.loads(raw)
1256     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1257
1258     if "instancesInfos" not in parsed:
1259         logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1260         return 1
1261
1262     logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1263     for row in parsed["instancesInfos"]:
1264         logger.debug("row[%s]='%s'", type(row), row)
1265         if "url" not in row:
1266             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1267             continue
1268         elif not domain_helper.is_wanted(row["url"]):
1269             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1270             continue
1271         elif instances.is_registered(row["url"]):
1272             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1273             continue
1274
1275         logger.info("Fetching row[url]='%s' ...", row["url"])
1276         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1277
1278     logger.debug("Success! - EXIT!")
1279     return 0
1280
1281 def recheck_obfuscation(args: argparse.Namespace) -> int:
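    """
    Re-fetches block lists from instances flagged as having (possibly)
    obfuscated entries, attempts to deobfuscate each wildcarded domain via
    utils.deobfuscate() and stores the remaining obfuscated count per
    instance. The scope can be narrowed by args.domain or args.software;
    args.force also rechecks recently checked instances. Returns 0 on success.

    Example (same assumption about sub-command wiring as above):

        python3 fba.py recheck_obfuscation --domain=example.social
    """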
1282     logger.debug("args[]='%s' - CALLED!", type(args))
1283
1284     logger.debug("Invoking locking.acquire() ...")
1285     locking.acquire()
1286
1287     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1288         logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1289         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1290     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1291         logger.debug("Fetching records for args.software='%s' ...", args.software)
1292         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1293     else:
1294         logger.debug("Fetching records where domains have obfuscated block entries ...")
1295         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1296
1297     rows = database.cursor.fetchall()
1298     logger.info("Checking %d domains ...", len(rows))
1299     for row in rows:
1300         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1301         if blacklist.is_blacklisted(row["domain"]):
1302             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1303             continue
1304         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1305             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1306             continue
1307
1308         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1309         blocking = federation.fetch_blocks(row["domain"])
1310
1311         logger.debug("blocking()=%d", len(blocking))
1312         if len(blocking) == 0:
1313             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1314             if row["software"] == "pleroma":
1315                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1316                 blocking = pleroma.fetch_blocks(row["domain"])
1317             elif row["software"] == "mastodon":
1318                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1319                 blocking = mastodon.fetch_blocks(row["domain"])
1320             elif row["software"] == "lemmy":
1321                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1322                 blocking = lemmy.fetch_blocks(row["domain"])
1323             elif row["software"] == "friendica":
1324                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1325                 blocking = friendica.fetch_blocks(row["domain"])
1326             elif row["software"] == "misskey":
1327                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1328                 blocking = misskey.fetch_blocks(row["domain"])
1329             else:
1330                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1331
1332         # chaos.social isn't part of oliphant's "hidden" blocklists
1333         logger.debug("row[domain]='%s'", row["domain"])
1334         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1335             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1336             instances.set_last_blocked(row["domain"])
1337             instances.set_total_blocks(row["domain"], blocking)
1338
1339         obfuscated = 0
1340         blockdict = list()
1341
1342         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1343         for block in blocking:
1344             logger.debug("block[blocked]='%s'", block["blocked"])
1345             blocked = None
1346
1347             if block["blocked"] == "":
1348                 logger.debug("block[blocked] is empty - SKIPPED!")
1349                 continue
1350             elif block["blocked"].endswith(".onion"):
1351                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1352                 continue
1353             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1354                 logger.debug("blocked='%s' is an I2P onion domain name - SKIPPED!", block["blocked"])
1355                 continue
1356             elif block["blocked"].endswith(".arpa"):
1357                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1358                 continue
1359             elif block["blocked"].endswith(".tld"):
1360                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1361                 continue
1362             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1363                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1364                 obfuscated = obfuscated + 1
1365                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1366             elif not domain_helper.is_wanted(block["blocked"]):
1367                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1368                 continue
1369             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1370                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1371                 continue
1372
1373             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1374             if blocked is not None and blocked != block["blocked"]:
1375                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1376                 obfuscated = obfuscated - 1
1377
1378                 if blacklist.is_blacklisted(blocked):
1379                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1380                     continue
1381                 elif blacklist.is_blacklisted(row["domain"]):
1382                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1383                     continue
1384                 elif blocks.is_instance_blocked(row["domain"], blocked):
1385                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1386                     continue
1387
1388                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1389
1390                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1391                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1392                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1393                     blockdict.append({
1394                         "blocked": blocked,
1395                         "reason" : block["reason"],
1396                     })
1397
1398         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1399         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1400         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1401
1402         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1403         if instances.has_pending(row["domain"]):
1404             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1405             instances.update(row["domain"])
1406
1407         logger.debug("Invoking commit() ...")
1408         database.connection.commit()
1409
1410         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1411         if config.get("bot_enabled") and len(blockdict) > 0:
1412             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1413             network.send_bot_post(row["domain"], blockdict)
1414
1415     logger.debug("Success! - EXIT!")
1416     return 0
1417
1418 def fetch_fedilist(args: argparse.Namespace) -> int:
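    """
    Downloads a CSV of instances from demo.fedilist.com, optionally filtered
    by args.software, and fetches instance data for every new, wanted
    'hostname' column value. Returns 0 on success, 1 on fetch errors or a
    too-recent query, 2 when the CSV cannot be parsed.
    """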
1419     logger.debug("args[]='%s' - CALLED!", type(args))
1420
1421     logger.debug("Invoking locking.acquire() ...")
1422     locking.acquire()
1423
1424     source_domain = "demo.fedilist.com"
1425     if sources.is_recent(source_domain):
1426         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1427         return 1
1428     else:
1429         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1430         sources.update(source_domain)
1431
1432     url = f"http://{source_domain}/instance/csv?onion=not"
1433     if args.software is not None and args.software != "":
1434         logger.debug("args.software='%s'", args.software)
1435         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1436
1437     logger.info("Fetching url='%s' ...", url)
1438     response = reqto.get(
1439         url,
1440         headers=network.web_headers,
1441         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1442         allow_redirects=False
1443     )
1444
1445     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1446     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1447         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1448         return 1
1449
1450     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1451
1452     logger.debug("reader[]='%s'", type(reader))
1453     if reader is None:
1454         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1455         return 2
1456
1457     rows = list(reader)
1458
1459     logger.info("Checking %d rows ...", len(rows))
1460     for row in rows:
1461         logger.debug("row[]='%s'", type(row))
1462         if "hostname" not in row:
1463             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1464             continue
1465
1466         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1467         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1468         logger.debug("domain='%s' - AFTER!", domain)
1469
1470         if domain in [None, ""]:
1471             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1472             continue
1473
1474         logger.debug("domain='%s' - BEFORE!", domain)
1475         domain = domain.encode("idna").decode("utf-8")
1476         logger.debug("domain='%s' - AFTER!", domain)
1477
1478         if not domain_helper.is_wanted(domain):
1479             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1480             continue
1481         elif (args.force is None or not args.force) and instances.is_registered(domain):
1482             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1483             continue
1484         elif instances.is_recent(domain):
1485             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1486             continue
1487
1488         logger.info("Fetching instances from domain='%s' ...", domain)
1489         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1490
1491     logger.debug("Success! - EXIT!")
1492     return 0
1493
1494 def update_nodeinfo(args: argparse.Namespace) -> int:
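    """
    Re-runs software detection (federation.determine_software()) for a
    selectable set of instances - a single args.domain, all instances of one
    args.software or args.mode, or one of the special filters - and updates
    the stored software type where it changed or args.force is set.
    Returns 0 on success.
    """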
1495     logger.debug("args[]='%s' - CALLED!", type(args))
1496
1497     logger.debug("Invoking locking.acquire() ...")
1498     locking.acquire()
1499
1500     if args.domain is not None and args.domain != "":
1501         logger.debug("Fetching args.domain='%s'", args.domain)
1502         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1503     elif args.software is not None and args.software != "":
1504         logger.info("Fetching domains for args.software='%s'", args.software)
1505         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1506     elif args.mode is not None and args.mode != "":
1507         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1508         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1509     elif args.no_software:
1510         logger.info("Fetching domains with no software type detected ...")
1511         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1512     elif args.with_software:
1513         logger.info("Fetching domains with any software type detected ...")
1514         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1515     elif args.no_auto:
1516         logger.info("Fetching domains with other detection mode than AUTO_DISOVERY being set ...")
1517         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1518     elif args.no_detection:
1519         logger.info("Fetching domains with no detection mode being set ...")
1520         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1521     else:
1522         logger.info("Fetching domains for recently updated ...")
1523         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1524
1525     domains = database.cursor.fetchall()
1526
1527     logger.info("Checking %d domain(s) ...", len(domains))
1528     cnt = 0
1529     for row in domains:
1530         logger.debug("row[]='%s'", type(row))
1531         if blacklist.is_blacklisted(row["domain"]):
1532             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1533             continue
1534         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1535             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1536             continue
1537
1538         try:
1539             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1540             software = federation.determine_software(row["domain"])
1541
1542             logger.debug("Determined software='%s'", software)
1543             if (software != row["software"] and software is not None) or args.force is True:
1544                 logger.debug("software='%s'", software)
1545                 if software is None:
1546                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1547                     instances.set_nodeinfo_url(row["domain"], None)
1548
1549                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1550                 instances.set_software(row["domain"], software)
1551
1552             if software is not None:
1553                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1554                 instances.set_success(row["domain"])
1555         except network.exceptions as exception:
1556             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1557             instances.set_last_error(row["domain"], exception)
1558
1559         instances.set_last_nodeinfo(row["domain"])
1560         instances.update(row["domain"])
1561         cnt = cnt + 1
1562
1563     logger.debug("Success! - EXIT!")
1564     return 0
1565
1566 def fetch_instances_social(args: argparse.Namespace) -> int:
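    """
    Queries the instances.social API (requires 'instances_social_api_key' in
    config.json) for its full instance list and fetches instance data for
    every new, wanted domain. Returns 0 on success, 1 on a missing API key,
    2-5 on rate limiting or API errors.
    """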
1567     logger.debug("args[]='%s' - CALLED!", type(args))
1568
1569     logger.debug("Invoking locking.acquire() ...")
1570     locking.acquire()
1571
1572     source_domain = "instances.social"
1573
1574     if config.get("instances_social_api_key") == "":
1575         logger.error("API key not set. Please set in your config.json file.")
1576         return 1
1577     elif sources.is_recent(source_domain):
1578         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1579         return 2
1580     else:
1581         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1582         sources.update(source_domain)
1583
1584     headers = {
1585         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1586     }
1587
1588     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1589     fetched = network.get_json_api(
1590         source_domain,
1591         "/api/1.0/instances/list?count=0&sort_by=name",
1592         headers=headers,
1593         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1594     )
1595     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1596
1597     if "error_message" in fetched:
1598         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1599         return 2
1600     elif "exception" in fetched:
1601         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1602         return 3
1603     elif "json" not in fetched:
1604         logger.warning("fetched has no element 'json' - EXIT!")
1605         return 4
1606     elif "instances" not in fetched["json"]:
1607         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1608         return 5
1609
1610     domains = list()
1611     rows = fetched["json"]["instances"]
1612
1613     logger.info("Checking %d row(s) ...", len(rows))
1614     for row in rows:
1615         logger.debug("row[]='%s'", type(row))
1616         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1617         logger.debug("domain='%s' - AFTER!", domain)
1618
1619         if domain in [None, ""]:
1620             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1621             continue
1622
1623         logger.debug("domain='%s' - BEFORE!", domain)
1624         domain = domain.encode("idna").decode("utf-8")
1625         logger.debug("domain='%s' - AFTER!", domain)
1626
1627         if not domain_helper.is_wanted(domain):
1628             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1629             continue
1630         elif domain in domains:
1631             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1632             continue
1633         elif instances.is_registered(domain):
1634             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1635             continue
1636         elif instances.is_recent(domain):
1637             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1638             continue
1639
1640         logger.info("Fetching instances from domain='%s' ...", domain)
1641         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1642
1643     logger.debug("Success! - EXIT!")
1644     return 0
1645
1646 def fetch_relaylist(args: argparse.Namespace) -> int:
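    """
    Fetches the relay list from api.relaylist.com and fetches instance data
    for every new, wanted relay domain taken from each row's 'url' field.
    Returns 0 on success, 1 on a too-recent query, 2-4 on API errors.
    """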
1647     logger.debug("args[]='%s' - CALLED!", type(args))
1648
1649     logger.debug("Invoking locking.acquire() ...")
1650     locking.acquire()
1651
1652     source_domain = "api.relaylist.com"
1653
1654     if sources.is_recent(source_domain):
1655         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1656         return 1
1657     else:
1658         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1659         sources.update(source_domain)
1660
1661     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1662     fetched = network.get_json_api(
1663         source_domain,
1664         "/relays",
1665         {},
1666         (config.get("connection_timeout"), config.get("read_timeout"))
1667     )
1668     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1669
1670     if "error_message" in fetched:
1671         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1672         return 2
1673     elif "exception" in fetched:
1674         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1675         return 3
1676     elif "json" not in fetched:
1677         logger.warning("fetched has no element 'json' - EXIT!")
1678         return 4
1679
1680     domains = list()
1681
1682     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1683     for row in fetched["json"]:
1684         logger.debug("row[]='%s'", type(row))
1685         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1686         logger.debug("domain='%s' - AFTER!", domain)
1687
1688         if domain in [None, ""]:
1689             logger.debug("domain='%s' is empty - SKIPPED!", domain)
1690             continue
1691
1692         logger.debug("domain='%s' - BEFORE!", domain)
1693         domain = domain.encode("idna").decode("utf-8")
1694         logger.debug("domain='%s' - AFTER!", domain)
1695
1696         if not domain_helper.is_wanted(domain):
1697             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1698             continue
1699         elif domain in domains:
1700             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1701             continue
1702         elif instances.is_registered(domain):
1703             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1704             continue
1705         elif instances.is_recent(domain):
1706             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1707             continue
1708
1709         logger.info("Fetching instances from domain='%s'", domain)
1710         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1711
1712     logger.debug("Success! - EXIT!")
1713     return 0
1714
1715 def fetch_relays(args: argparse.Namespace) -> int:
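    """
    Crawls known relay instances (activityrelay, aoderelay, selective-relay,
    pub-relay) for their registered peers: pub-relay peers come from the
    nodeinfo 'metadata.peers' array, the other types are scraped from the
    relay's HTML front page. Peers are stored per relay and new, wanted
    domains are fetched afterwards. Returns 0 on success.
    """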
1716     logger.debug("args[]='%s' - CALLED!", type(args))
1717
1718     logger.debug("Invoking locking.acquire() ...")
1719     locking.acquire()
1720
1721     if args.domain is not None and args.domain != "":
1722         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1723         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1724     elif args.software is not None and args.software != "":
1725         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1726         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1727     else:
1728         logger.debug("Fetch all relay instances ...")
1729         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1730
1731     domains = list()
1732     rows = database.cursor.fetchall()
1733
1734     logger.info("Checking %d relays ...", len(rows))
1735     for row in rows:
1736         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1737         if not args.force and instances.is_recent(row["domain"]):
1738             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1739             continue
1740         elif row["nodeinfo_url"] is None:
1741             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1742             continue
1743
1744         peers = list()
1745         try:
1746             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1747             if row["software"] == "pub-relay":
1748                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1749                 raw = network.fetch_api_url(
1750                     row["nodeinfo_url"],
1751                     (config.get("connection_timeout"), config.get("read_timeout"))
1752                 )
1753
1754                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1755                 if "exception" in raw:
1756                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1757                     raise raw["exception"]
1758                 elif "error_message" in raw:
1759                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1760                     instances.set_last_error(row["domain"], raw)
1761                     instances.set_last_instance_fetch(row["domain"])
1762                     instances.update(row["domain"])
1763                     continue
1764                 elif "json" not in raw:
1765                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1766                     continue
1767                 elif not "metadata" in raw["json"]:
1768                     logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
1769                     continue
1770                 elif not "peers" in raw["json"]["metadata"]:
1771                     logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
1772                     continue
1773             else:
1774                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1775                 raw = network.fetch_url(
1776                     f"https://{row['domain']}",
1777                     network.web_headers,
1778                     (config.get("connection_timeout"), config.get("read_timeout"))
1779                 ).text
1780                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1781
1782                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1783                 logger.debug("doc[]='%s'", type(doc))
1784
1785         except network.exceptions as exception:
1786             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1787             instances.set_last_error(row["domain"], exception)
1788             instances.set_last_instance_fetch(row["domain"])
1789             instances.update(row["domain"])
1790             continue
1791
1792         logger.debug("row[software]='%s'", row["software"])
1793         if row["software"] == "activityrelay":
1794             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1795             tags = doc.findAll("p")
1796
1797             logger.debug("Checking %d paragraphs ...", len(tags))
1798             for tag in tags:
1799                 logger.debug("tag[]='%s'", type(tag))
1800                 if len(tag.contents) == 0:
1801                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1802                     continue
1803                 elif "registered instances" not in tag.contents[0]:
1804                     logger.debug("Skipping paragraph, text not found.")
1805                     continue
1806
1807                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1808                 for domain in tag.contents:
1809                     logger.debug("domain[%s]='%s'", type(domain), domain)
1810                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1811                         continue
1812
1813                     domain = str(domain)
1814                     logger.debug("domain='%s'", domain)
1815                     if not domain_helper.is_wanted(domain):
1816                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1817                         continue
1818
1819                     logger.debug("domain='%s' - BEFORE!", domain)
1820                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1821                     logger.debug("domain='%s' - AFTER!", domain)
1822
1823                     if domain in [None, ""]:
1824                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1825                         continue
1826                     elif domain not in peers:
1827                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1828                         peers.append(domain)
1829
1830                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1831                     if dict_helper.has_key(domains, "domain", domain):
1832                         logger.debug("domain='%s' already added", domain)
1833                         continue
1834
1835                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1836                     domains.append({
1837                         "domain": domain,
1838                         "origin": row["domain"],
1839                     })
1840         elif row["software"] in ["aoderelay", "selective-relay"]:
1841             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1842             if row["software"] == "aoderelay":
1843                 tags = doc.findAll("section", {"class": "instance"})
1844             else:
1845                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1846
1847             logger.debug("Checking %d tags ...", len(tags))
1848             for tag in tags:
1849                 logger.debug("tag[]='%s'", type(tag))
1850
1851                 link = tag.find("a")
1852                 logger.debug("link[%s]='%s'", type(link), link)
1853                 if not isinstance(link, bs4.element.Tag):
1854                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1855                     continue
1856
1857                 components = urlparse(link.get("href"))
1858                 logger.debug("components(%d)='%s'", len(components), components)
1859                 domain = components.netloc.lower().split(":")[0]
1860
1861                 logger.debug("domain='%s' - BEFORE!", domain)
1862                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1863                 logger.debug("domain='%s' - AFTER!", domain)
1864
1865                 if domain in [None, ""]:
1866                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1867                     continue
1868                 elif domain not in peers:
1869                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1870                     peers.append(domain)
1871
1872                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1873                 if dict_helper.has_key(domains, "domain", domain):
1874                     logger.debug("domain='%s' already added", domain)
1875                     continue
1876
1877                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1878                 domains.append({
1879                     "domain": domain,
1880                     "origin": row["domain"],
1881                 })
1882         elif row["software"] == "pub-relay":
1883             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1884             for domain in raw["json"]["metadata"]["peers"]:
1885                 logger.debug("domain='%s' - BEFORE!", domain)
1886                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1887                 logger.debug("domain='%s' - AFTER!", domain)
1888
1889                 if domain in [None, ""]:
1890                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1891                     continue
1892                 elif domain not in peers:
1893                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1894                     peers.append(domain)
1895
1896                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1897                 if dict_helper.has_key(domains, "domain", domain):
1898                     logger.debug("domain='%s' already added", domain)
1899                     continue
1900
1901                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1902                 domains.append({
1903                     "domain": domain,
1904                     "origin": row["domain"],
1905                 })
1906         else:
1907             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1908             continue
1909
1910         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1911         instances.set_last_instance_fetch(row["domain"])
1912
1913         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1914         instances.set_total_peers(row["domain"], peers)
1915
1916         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1917         instances.update(row["domain"])
1918
1919     logger.info("Checking %d domains ...", len(domains))
1920     for row in domains:
1921         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1922         if not domain_helper.is_wanted(row["domain"]):
1923             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1924             continue
1925         elif instances.is_registered(row["domain"]):
1926             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1927             continue
1928
1929         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1930         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1931
1932     logger.debug("Success! - EXIT!")
1933     return 0
1934
1935 def convert_idna(args: argparse.Namespace) -> int:
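    """
    Rewrites all domain columns that are not yet in punycode ('xn--') form -
    instances.domain, instances.origin, blocks.blocker and blocks.blocked -
    via the respective translate_idnas() helpers. Returns 0 on success.
    """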
1936     logger.debug("args[]='%s' - CALLED!", type(args))
1937
1938     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1939     rows = database.cursor.fetchall()
1940
1941     logger.debug("rows[]='%s'", type(rows))
1942     instances.translate_idnas(rows, "domain")
1943
1944     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1945     rows = database.cursor.fetchall()
1946
1947     logger.debug("rows[]='%s'", type(rows))
1948     instances.translate_idnas(rows, "origin")
1949
1950     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1951     rows = database.cursor.fetchall()
1952
1953     logger.debug("rows[]='%s'", type(rows))
1954     blocks.translate_idnas(rows, "blocker")
1955
1956     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1957     rows = database.cursor.fetchall()
1958
1959     logger.debug("rows[]='%s'", type(rows))
1960     blocks.translate_idnas(rows, "blocked")
1961
1962     logger.debug("Success! - EXIT!")
1963     return 0
1964
1965 def remove_invalid(args: argparse.Namespace) -> int:
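    """
    Deletes all instance and block records whose domain fails validation via
    validators.domain(), then commits and VACUUMs the database.
    Returns 0 on success.
    """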
1966     logger.debug("args[]='%s' - CALLED!", type(args))
1967
1968     logger.debug("Invoking locking.acquire() ...")
1969     locking.acquire()
1970
1971     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1972     rows = database.cursor.fetchall()
1973
1974     logger.info("Checking %d domains ...", len(rows))
1975     for row in rows:
1976         logger.debug("row[domain]='%s'", row["domain"])
1977         if not validators.domain(row["domain"].split("/")[0]):
1978             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1979             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1980             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1981
1982     logger.debug("Invoking commit() ...")
1983     database.connection.commit()
1984
1985     logger.info("Vaccum cleaning database ...")
1986     database.cursor.execute("VACUUM")
1987
1988     logger.debug("Success! - EXIT!")
1989     return 0