# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
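    """Checks whether args.domain is valid, not blacklisted and not yet
    registered. Returns 0 if the domain is unknown, 100 if it is invalid,
    101 if it is blacklisted and 102 if it is already registered."""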
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
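    """Cross-checks each instance's stored nodeinfo_url against its (punycode)
    domain name and logs rows where neither form matches. Always returns 0."""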
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
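    """Fetches the list of all known Pixelfed instances from the pixelfed.org
    API and registers every new, wanted domain. Returns 0 on success, 1 if the
    source was accessed too recently and 100+ on errors."""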
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers by yourself here
    headers = dict()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] is None or row["domain"] == "":
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
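    """Fetches a domain list from the gql.api.bka.li GraphQL API and registers
    every new, wanted domain. Returns 0 on success, 1 if the source was
    accessed too recently and 100+ on errors."""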
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows(data)()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] is None or entry["domain"] == "":
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
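    """Fetches blocklists from registered instances (a single domain via
    args.domain, a single software type via args.software, all instances via
    args.force, or everything due for a re-check), deobfuscates blocked
    domains where possible and stores the resulting blocks. Returns 0 on
    success and 100+ on argument errors."""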
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] is None or block["blocked"] == "":
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] is None or block["blocked"] == "":
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
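    """Fetches per-software instance lists from fediverse.observer (all listed
    software types, or only args.software) and registers every new, wanted
    domain. Returns 0 on success, 1 on errors."""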
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain[%s]='%s'", type(domain), domain)
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
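    """Fetches the silenced/limited and suspended server lists from
    wiki.todon.eu and stores them as blocks for todon.eu. Returns 0 on
    success, 1 if the source was accessed too recently."""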
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
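    """Fetches the silenced and blocked instance tables from chaos.social's
    federation.md and stores them as blocks for chaos.social. Returns 0 on
    success, 1 if the source was accessed too recently."""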
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
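    """Fetches an FBA-specific RSS feed (args.feed) and registers every new,
    wanted domain found in it. Returns 0 on success, 100 on fetch errors."""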
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
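    """Fetches the ATOM feed of the FBA bot account (ryona.agency by default,
    or args.feed) and registers every new, wanted domain linked from its
    entries. Returns 0 on success, 100 on fetch errors."""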
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))

            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain is None or domain == "":
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
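    """Fetches the peer list of args.domain and registers it, then (unless
    args.single is set) re-crawls all known instances whose last fetch is
    older than the configured recheck_instance interval. Returns 0 on
    success, 100+ on errors."""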
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 102

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
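    """Processes all CSV-based blocklists configured in blocklists.csv_files,
    optionally restricted to a single blocker via args.domain. Always
    returns 0."""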
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
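    """Downloads oliphant's blocklist CSV files from codeberg.org and
    processes them, optionally restricted to a single blocker via args.domain.
    Returns 0 on success, 1 if the source was accessed too recently."""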
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
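    """Fetches plain-text blocklists configured in blocklists.txt_files and
    processes each listed domain. Always returns 0."""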
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain is None or domain == "":
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
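    """Fetches the list of pact-signing instances from fedipact.online and
    registers every new, wanted domain. Returns 0 on success, 1 if the source
    was accessed too recently."""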
1102     logger.debug("args[]='%s' - CALLED!", type(args))
1103
1104     logger.debug("Invoking locking.acquire() ...")
1105     locking.acquire()
1106
1107     source_domain = "fedipact.online"
1108     if sources.is_recent(source_domain):
1109         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1110         return 1
1111     else:
1112         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1113         sources.update(source_domain)
1114
1115     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1116     response = utils.fetch_url(
1117         f"https://{source_domain}",
1118         network.web_headers,
1119         (config.get("connection_timeout"), config.get("read_timeout"))
1120     )
1121
1122     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1123     if response.ok and response.status_code == 200 and response.text != "":
1124         logger.debug("Parsing %d Bytes ...", len(response.text))
1125
1126         doc = bs4.BeautifulSoup(response.text, "html.parser")
1127         logger.debug("doc[]='%s'", type(doc))
1128
1129         rows = doc.findAll("li")
1130         logger.info("Checking %d row(s) ...", len(rows))
1131         for row in rows:
1132             logger.debug("row[]='%s'", type(row))
1133             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1134
1135             logger.debug("domain='%s' - AFTER!", domain)
1136             if domain is None or domain == "":
1137                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1138                 continue
1139
1140             logger.debug("domain='%s' - BEFORE!", domain)
1141             domain = domain.encode("idna").decode("utf-8")
1142             logger.debug("domain='%s' - AFTER!", domain)
1143
1144             if not domain_helper.is_wanted(domain):
1145                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1146                 continue
1147             elif instances.is_registered(domain):
1148                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1149                 continue
1150             elif instances.is_recent(domain):
1151                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1152                 continue
1153
1154             logger.info("Fetching domain='%s' ...", domain)
1155             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1156
1157     logger.debug("Success! - EXIT!")
1158     return 0
1159
1160 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
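    """
    Fetches the instance list from the instances.joinmobilizon.org API and
    registers every unknown, wanted Mobilizon instance. Returns 0 on success,
    1 if the source has been accessed too recently or the response lacks a
    'data' element.
    """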
1161     logger.debug("args[]='%s' - CALLED!", type(args))
1162
1163     logger.debug("Invoking locking.acquire() ...")
1164     locking.acquire()
1165
1166     source_domain = "instances.joinmobilizon.org"
1167     if sources.is_recent(source_domain):
1168         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1169         return 1
1170     else:
1171         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1172         sources.update(source_domain)
1173
1174     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1175     raw = utils.fetch_url(
1176         f"https://{source_domain}/api/v1/instances",
1177         network.web_headers,
1178         (config.get("connection_timeout"), config.get("read_timeout"))
1179     ).text
1180     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1181
1182     parsed = json.loads(raw)
1183     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1184
1185     if "data" not in parsed:
1186         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1187         return 1
1188
1189     logger.info("Checking %d instances ...", len(parsed["data"]))
1190     for row in parsed["data"]:
1191         logger.debug("row[]='%s'", type(row))
1192         if "host" not in row:
1193             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1194             continue
1195         elif not domain_helper.is_wanted(row["host"]):
1196             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1197             continue
1198         elif instances.is_registered(row["host"]):
1199             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1200             continue
1201
1202         logger.info("Fetching row[host]='%s' ...", row["host"])
1203         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1204
1205     logger.debug("Success! - EXIT!")
1206     return 0
1207
1208 def fetch_joinmisskey(args: argparse.Namespace) -> int:
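    """
    Fetches instances.json from instanceapp.misskey.page and registers every
    unknown, wanted Misskey instance listed under 'instancesInfos'. Returns 0
    on success, 1 if the source has been accessed too recently or the
    response is malformed.
    """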
1209     logger.debug("args[]='%s' - CALLED!", type(args))
1210
1211     logger.debug("Invoking locking.acquire() ...")
1212     locking.acquire()
1213
1214     source_domain = "instanceapp.misskey.page"
1215     if sources.is_recent(source_domain):
1216         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1217         return 1
1218     else:
1219         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1220         sources.update(source_domain)
1221
1222     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1223     raw = utils.fetch_url(
1224         f"https://{source_domain}/instances.json",
1225         network.web_headers,
1226         (config.get("connection_timeout"), config.get("read_timeout"))
1227     ).text
1228     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1229
1230     parsed = json.loads(raw)
1231     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1232
1233     if "instancesInfos" not in parsed:
1234         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1235         return 1
1236
1237     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1238     for row in parsed["instancesInfos"]:
1239         logger.debug("row[%s]='%s'", type(row), row)
1240         if "url" not in row:
1241             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1242             continue
1243         elif not domain_helper.is_wanted(row["url"]):
1244             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1245             continue
1246         elif instances.is_registered(row["url"]):
1247             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1248             continue
1249
1250         logger.info("Fetching row[url]='%s' ...", row["url"])
1251         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1252
1253     logger.debug("Success! - EXIT!")
1254     return 0
1255
1256 def recheck_obfuscation(args: argparse.Namespace) -> int:
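    """
    Re-fetches block lists from instances flagged as having (or possibly
    having) obfuscated entries, optionally limited by --domain or --software,
    attempts to deobfuscate wildcard entries and updates each instance's
    obfuscation counters. Returns 0 on success.
    """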
1257     logger.debug("args[]='%s' - CALLED!", type(args))
1258
1259     logger.debug("Invoking locking.acquire() ...")
1260     locking.acquire()
1261
1262     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1263         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1264     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1265         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1266     else:
1267         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1268
1269     rows = database.cursor.fetchall()
1270     logger.info("Checking %d domains ...", len(rows))
1271     for row in rows:
1272         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1273         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1274             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1275             continue
1276
1277         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1278         blocking = federation.fetch_blocks(row["domain"])
1279
1280         logger.debug("blocking()=%d", len(blocking))
1281         if len(blocking) == 0:
1282             if row["software"] == "pleroma":
1283                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1284                 blocking = pleroma.fetch_blocks(row["domain"])
1285             elif row["software"] == "mastodon":
1286                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1287                 blocking = mastodon.fetch_blocks(row["domain"])
1288             elif row["software"] == "lemmy":
1289                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1290                 blocking = lemmy.fetch_blocks(row["domain"])
1291             elif row["software"] == "friendica":
1292                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1293                 blocking = friendica.fetch_blocks(row["domain"])
1294             elif row["software"] == "misskey":
1295                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1296                 blocking = misskey.fetch_blocks(row["domain"])
1297             else:
1298                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1299
1300         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1301         logger.debug("row[domain]='%s'", row["domain"])
1302         if row["domain"] != "chaos.social" and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1303             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1304             instances.set_last_blocked(row["domain"])
1305             instances.set_total_blocks(row["domain"], blocking)
1306
1307         obfuscated = 0
1308         blockdict = list()
1309
1310         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1311         for block in blocking:
1312             logger.debug("block[blocked]='%s'", block["blocked"])
1313             blocked = None
1314
1315             if block["blocked"] == "":
1316                 logger.debug("block[blocked] is empty - SKIPPED!")
1317                 continue
1318             elif block["blocked"].endswith(".arpa"):
1319                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1320                 continue
1321             elif block["blocked"].endswith(".tld"):
1322                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1323                 continue
1324             elif block["blocked"].endswith(".onion"):
1325                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1326                 continue
1327             elif "*" in block["blocked"] or "?" in block["blocked"]:
1328                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1329                 obfuscated = obfuscated + 1
1330                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1331             elif not domain_helper.is_wanted(block["blocked"]):
1332                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1333                 continue
1334             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1335                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1336                 continue
1337
1338             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1339             if blocked is not None and blocked != block["blocked"]:
1340                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1341                 obfuscated = obfuscated - 1
1342
1343                 if blacklist.is_blacklisted(blocked):
1344                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1345                     continue
1346                 elif blacklist.is_blacklisted(row["domain"]):
1347                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1348                     continue
1349                 elif blocks.is_instance_blocked(row["domain"], blocked):
1350                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1351                     continue
1352
1353                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1354
1355                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1356                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1357                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1358                     blockdict.append({
1359                         "blocked": blocked,
1360                         "reason" : block["reason"],
1361                     })
1362
1363         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1364         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1365         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1366
1367         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1368         if instances.has_pending(row["domain"]):
1369             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1370             instances.update(row["domain"])
1371
1372         logger.debug("Invoking commit() ...")
1373         database.connection.commit()
1374
1375         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1376         if config.get("bot_enabled") and len(blockdict) > 0:
1377             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1378             network.send_bot_post(row["domain"], blockdict)
1379
1380     logger.debug("Success! - EXIT!")
1381     return 0
1382
1383 def fetch_fedilist(args: argparse.Namespace) -> int:
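    """
    Downloads the instance list from demo.fedilist.com as CSV, optionally
    filtered by --software, converts each hostname to punycode and registers
    every unknown, wanted domain. Returns 0 on success, non-zero on fetch or
    parser errors.
    """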
1384     logger.debug("args[]='%s' - CALLED!", type(args))
1385
1386     logger.debug("Invoking locking.acquire() ...")
1387     locking.acquire()
1388
1389     source_domain = "demo.fedilist.com"
1390     if sources.is_recent(source_domain):
1391         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1392         return 1
1393     else:
1394         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1395         sources.update(source_domain)
1396
1397     url = f"http://{source_domain}/instance/csv?onion=not"
1398     if args.software is not None and args.software != "":
1399         logger.debug("args.software='%s'", args.software)
1400         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1401
1402     logger.info("Fetching url='%s' ...", url)
1403     response = reqto.get(
1404         url,
1405         headers=network.web_headers,
1406         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1407         allow_redirects=False
1408     )
1409
1410     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1411     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1412         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1413         return 1
1414
1415     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1416
1417     logger.debug("reader[]='%s'", type(reader))
1418     if reader is None:
1419         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1420         return 2
1421
1422     rows = list(reader)
1423
1424     logger.info("Checking %d rows ...", len(rows))
1425     for row in rows:
1426         logger.debug("row[]='%s'", type(row))
1427         if "hostname" not in row:
1428             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1429             continue
1430
1431         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1432         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1433         logger.debug("domain='%s' - AFTER!", domain)
1434
1435         if domain is None or domain == "":
1436             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1437             continue
1438
1439         logger.debug("domain='%s' - BEFORE!", domain)
1440         domain = domain.encode("idna").decode("utf-8")
1441         logger.debug("domain='%s' - AFTER!", domain)
1442
1443         if not domain_helper.is_wanted(domain):
1444             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1445             continue
1446         elif (args.force is None or not args.force) and instances.is_registered(domain):
1447             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1448             continue
1449         elif instances.is_recent(domain):
1450             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1451             continue
1452
1453         logger.info("Fetching instances from domain='%s' ...", domain)
1454         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1455
1456     logger.debug("Success! - EXIT!")
1457     return 0
1458
1459 def update_nodeinfo(args: argparse.Namespace) -> int:
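    """
    Re-runs software detection for instances selected by --domain,
    --software, --mode, --no-software, --no-auto or --no-detection (all
    instances otherwise, least recently updated first) and records any
    changed software type. Returns 0 on success.
    """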
1460     logger.debug("args[]='%s' - CALLED!", type(args))
1461
1462     logger.debug("Invoking locking.acquire() ...")
1463     locking.acquire()
1464
1465     if args.domain is not None and args.domain != "":
1466         logger.debug("Fetching args.domain='%s'", args.domain)
1467         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1468     elif args.software is not None and args.software != "":
1469         logger.info("Fetching domains for args.software='%s'", args.software)
1470         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1471     elif args.mode is not None and args.mode != "":
1472         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1473         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1474     elif args.no_software:
1475         logger.info("Fetching domains with no software type detected ...")
1476         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1477     elif args.no_auto:
1478         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1479         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1480     elif args.no_detection:
1481         logger.info("Fetching domains with no detection mode being set ...")
1482         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1483     else:
1484         logger.info("Fetching all domains, least recently updated first ...")
1485         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1486
1487     domains = database.cursor.fetchall()
1488
1489     logger.info("Checking %d domain(s) ...", len(domains))
1490     cnt = 0
1491     for row in domains:
1492         logger.debug("row[]='%s'", type(row))
1493         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1494             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1495             continue
1496
1497         try:
1498             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1499             software = federation.determine_software(row["domain"])
1500
1501             logger.debug("Determined software='%s'", software)
1502             if (software != row["software"] and software is not None) or args.force is True:
1503                 logger.debug("software='%s'", software)
1504                 if software is None:
1505                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1506                     instances.set_nodeinfo_url(row["domain"], None)
1507
1508                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1509                 instances.set_software(row["domain"], software)
1510
1511             if software is not None:
1512                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1513                 instances.set_success(row["domain"])
1514         except network.exceptions as exception:
1515             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1516             instances.set_last_error(row["domain"], exception)
1517
1518         instances.set_last_nodeinfo(row["domain"])
1519         instances.update(row["domain"])
1520         cnt = cnt + 1
1521
1522     logger.debug("Success! - EXIT!")
1523     return 0
1524
1525 def fetch_instances_social(args: argparse.Namespace) -> int:
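    """
    Queries the instances.social list API (requires 'instances_social_api_key'
    to be set in config.json) and registers every unknown, wanted domain.
    Returns 0 on success, non-zero on configuration, recent-access or API
    errors.
    """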
1526     logger.debug("args[]='%s' - CALLED!", type(args))
1527
1528     logger.debug("Invoking locking.acquire() ...")
1529     locking.acquire()
1530
1531     source_domain = "instances.social"
1532
1533     if config.get("instances_social_api_key") == "":
1534         logger.error("API key not set. Please set it in your config.json file.")
1535         return 1
1536     elif sources.is_recent(source_domain):
1537         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1538         return 2
1539     else:
1540         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1541         sources.update(source_domain)
1542
1543     headers = {
1544         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1545     }
1546
1547     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1548     fetched = network.get_json_api(
1549         source_domain,
1550         "/api/1.0/instances/list?count=0&sort_by=name",
1551         headers=headers,
1552         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1553     )
1554     logger.debug("fetched[]='%s'", type(fetched))
1555
1556     if "error_message" in fetched:
1557         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1558         return 2
1559     elif "exception" in fetched:
1560         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1561         return 3
1562     elif "json" not in fetched:
1563         logger.warning("fetched has no element 'json' - EXIT!")
1564         return 4
1565     elif "instances" not in fetched["json"]:
1566         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1567         return 5
1568
1569     domains = list()
1570     rows = fetched["json"]["instances"]
1571
1572     logger.info("Checking %d row(s) ...", len(rows))
1573     for row in rows:
1574         logger.debug("row[]='%s'", type(row))
1575         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1576         logger.debug("domain='%s' - AFTER!", domain)
1577
1578         if domain is None or domain == "":
1579             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1580             continue
1581
1582         logger.debug("domain='%s' - BEFORE!", domain)
1583         domain = domain.encode("idna").decode("utf-8")
1584         logger.debug("domain='%s' - AFTER!", domain)
1585
1586         if not domain_helper.is_wanted(domain):
1587             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1588             continue
1589         elif domain in domains:
1590             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1591             continue
1592         elif instances.is_registered(domain):
1593             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1594             continue
1595         elif instances.is_recent(domain):
1596             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1597             continue
1598
1599         logger.info("Fetching instances from domain='%s'", domain)
1600         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1601
1602     logger.debug("Success! - EXIT!")
1603     return 0
1604
1605 def fetch_relaylist(args: argparse.Namespace) -> int:
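    """
    Fetches the relay list from the api.relaylist.com API and registers every
    unknown, wanted relay domain. Returns 0 on success, non-zero if the
    source has been accessed too recently or the API returned an error.
    """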
1606     logger.debug("args[]='%s' - CALLED!", type(args))
1607
1608     logger.debug("Invoking locking.acquire() ...")
1609     locking.acquire()
1610
1611     source_domain = "api.relaylist.com"
1612
1613     if sources.is_recent(source_domain):
1614         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1615         return 1
1616     else:
1617         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1618         sources.update(source_domain)
1619
1620     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1621     fetched = network.get_json_api(
1622         source_domain,
1623         "/relays",
1624         {},
1625         (config.get("connection_timeout"), config.get("read_timeout"))
1626     )
1627     logger.debug("fetched[]='%s'", type(fetched))
1628
1629     if "error_message" in fetched:
1630         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1631         return 2
1632     elif "exception" in fetched:
1633         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1634         return 3
1635     elif "json" not in fetched:
1636         logger.warning("fetched has no element 'json' - EXIT!")
1637         return 4
1638
1639     domains = list()
1640
1641     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1642     for row in fetched["json"]:
1643         logger.debug("row[]='%s'", type(row))
1644         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1645         logger.debug("domain='%s' - AFTER!", domain)
1646
1647         if domain is None or domain == "":
1648             logger.debug("domain='%s' is empty after parsing row[url] - SKIPPED!", domain)
1649             continue
1650
1651         logger.debug("domain='%s' - BEFORE!", domain)
1652         domain = domain.encode("idna").decode("utf-8")
1653         logger.debug("domain='%s' - AFTER!", domain)
1654
1655         if not domain_helper.is_wanted(domain):
1656             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1657             continue
1658         elif domain in domains:
1659             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1660             continue
1661         elif instances.is_registered(domain):
1662             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1663             continue
1664         elif instances.is_recent(domain):
1665             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1666             continue
1667
1668         logger.info("Fetching instances from domain='%s'", domain)
1669         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1670
1671     logger.debug("Success! - EXIT!")
1672     return 0
1673
1674 def fetch_relays(args: argparse.Namespace) -> int:
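    """
    Fetches peer lists from known relay instances: activityrelay, aoderelay
    and selective-relay are scraped from their HTML front page, while
    pub-relay is read from its nodeinfo metadata. Stores the peer list per
    relay and registers every unknown, wanted peer domain. Returns 0 on
    success.
    """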
1675     logger.debug("args[]='%s' - CALLED!", type(args))
1676
1677     logger.debug("Invoking locking.acquire() ...")
1678     locking.acquire()
1679
1680     if args.domain is not None and args.domain != "":
1681         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1682     elif args.software is not None and args.software != "":
1683         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1684     else:
1685         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1686
1687     domains = list()
1688     rows = database.cursor.fetchall()
1689
1690     logger.info("Checking %d relays ...", len(rows))
1691     for row in rows:
1692         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1693         peers = list()
1694         if not args.force and instances.is_recent(row["domain"]):
1695             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1696             continue
1697
1698         try:
1699             if row["software"] == "pub-relay":
1700                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1701                 raw = network.fetch_api_url(
1702                     row["nodeinfo_url"],
1703                     (config.get("connection_timeout"), config.get("read_timeout"))
1704                 )
1705
1706                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1707                 if "exception" in raw:
1708                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1709                     raise raw["exception"]
1710                 elif "error_message" in raw:
1711                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1712                     instances.set_last_error(row["domain"], raw)
1713                     instances.set_last_instance_fetch(row["domain"])
1714                     instances.update(row["domain"])
1715                     continue
1716                 elif "json" not in raw:
1717                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1718                     continue
1719                 elif "metadata" not in raw["json"]:
1720                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1721                     continue
1722                 elif "peers" not in raw["json"]["metadata"]:
1723                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1724                     continue
1725             else:
1726                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1727                 raw = utils.fetch_url(
1728                     f"https://{row['domain']}",
1729                     network.web_headers,
1730                     (config.get("connection_timeout"), config.get("read_timeout"))
1731                 ).text
1732                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1733
1734                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1735                 logger.debug("doc[]='%s'", type(doc))
1736
1737         except network.exceptions as exception:
1738             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1739             instances.set_last_error(row["domain"], exception)
1740             instances.set_last_instance_fetch(row["domain"])
1741             instances.update(row["domain"])
1742             continue
1743
1744         logger.debug("row[software]='%s'", row["software"])
1745         if row["software"] == "activityrelay":
1746             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1747             tags = doc.findAll("p")
1748
1749             logger.debug("Checking %d paragraphs ...", len(tags))
1750             for tag in tags:
1751                 logger.debug("tag[]='%s'", type(tag))
1752                 if len(tag.contents) == 0:
1753                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1754                     continue
1755                 elif "registered instances" not in tag.contents[0]:
1756                     logger.debug("Skipping paragraph, text not found.")
1757                     continue
1758
1759                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1760                 for domain in tag.contents:
1761                     logger.debug("domain[%s]='%s'", type(domain), domain)
1762                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1763                         continue
1764
1765                     domain = str(domain)
1766                     logger.debug("domain='%s'", domain)
1767                     if not domain_helper.is_wanted(domain):
1768                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1769                         continue
1770
1771                     logger.debug("domain='%s' - BEFORE!", domain)
1772                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1773                     logger.debug("domain='%s' - AFTER!", domain)
1774
1775                     if domain is None or domain == "":
1776                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1777                         continue
1778                     elif domain not in peers:
1779                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1780                         peers.append(domain)
1781
1782                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1783                     if dict_helper.has_key(domains, "domain", domain):
1784                         logger.debug("domain='%s' already added", domain)
1785                         continue
1786
1787                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1788                     domains.append({
1789                         "domain": domain,
1790                         "origin": row["domain"],
1791                     })
1792         elif row["software"] in ["aoderelay", "selective-relay"]:
1793             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1794             if row["software"] == "aoderelay":
1795                 tags = doc.findAll("section", {"class": "instance"})
1796             else:
1797                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1798
1799             logger.debug("Checking %d tags ...", len(tags))
1800             for tag in tags:
1801                 logger.debug("tag[]='%s'", type(tag))
1802
1803                 link = tag.find("a")
1804                 logger.debug("link[%s]='%s'", type(link), link)
1805                 if not isinstance(link, bs4.element.Tag):
1806                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1807                     continue
1808
1809                 components = urlparse(link.get("href"))
1810                 logger.debug("components(%d)='%s'", len(components), components)
1811                 domain = components.netloc.lower().split(":")[0]
1812
1813                 logger.debug("domain='%s' - BEFORE!", domain)
1814                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1815                 logger.debug("domain='%s' - AFTER!", domain)
1816
1817                 if domain is None or domain == "":
1818                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1819                     continue
1820                 elif domain not in peers:
1821                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1822                     peers.append(domain)
1823
1824                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1825                 if dict_helper.has_key(domains, "domain", domain):
1826                     logger.debug("domain='%s' already added", domain)
1827                     continue
1828
1829                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1830                 domains.append({
1831                     "domain": domain,
1832                     "origin": row["domain"],
1833                 })
1834         elif row["software"] == "pub-relay":
1835             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1836             for domain in raw["json"]["metadata"]["peers"]:
1837                 logger.debug("domain='%s' - BEFORE!", domain)
1838                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1839                 logger.debug("domain='%s' - AFTER!", domain)
1840
1841                 if domain is None or domain == "":
1842                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1843                     continue
1844                 elif domain not in peers:
1845                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1846                     peers.append(domain)
1847
1848                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1849                 if dict_helper.has_key(domains, "domain", domain):
1850                     logger.debug("domain='%s' already added", domain)
1851                     continue
1852
1853                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1854                 domains.append({
1855                     "domain": domain,
1856                     "origin": row["domain"],
1857                 })
1858         else:
1859             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1860             continue
1861
1862         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1863         instances.set_last_instance_fetch(row["domain"])
1864
1865         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1866         instances.set_total_peers(row["domain"], peers)
1867
1868         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1869         instances.update(row["domain"])
1870
1871     logger.info("Checking %d domains ...", len(domains))
1872     for row in domains:
1873         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1874         if not domain_helper.is_wanted(row["domain"]):
1875             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1876             continue
1877         elif instances.is_registered(row["domain"]):
1878             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1879             continue
1880
1881         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1882         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1883
1884     logger.debug("Success! - EXIT!")
1885     return 0
1886
1887 def convert_idna(args: argparse.Namespace) -> int:
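    """
    Converts all non-punycode 'domain' and 'origin' values in the instances
    table as well as 'blocker' and 'blocked' values in the blocks table to
    their IDNA (punycode) representation. Returns 0 on success.
    """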
1888     logger.debug("args[]='%s' - CALLED!", type(args))
1889
1890     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1891     rows = database.cursor.fetchall()
1892
1893     logger.debug("rows[]='%s'", type(rows))
1894     instances.translate_idnas(rows, "domain")
1895
1896     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1897     rows = database.cursor.fetchall()
1898
1899     logger.debug("rows[]='%s'", type(rows))
1900     instances.translate_idnas(rows, "origin")
1901
1902     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1903     rows = database.cursor.fetchall()
1904
1905     logger.debug("rows[]='%s'", type(rows))
1906     blocks.translate_idnas(rows, "blocker")
1907
1908     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1909     rows = database.cursor.fetchall()
1910
1911     logger.debug("rows[]='%s'", type(rows))
1912     blocks.translate_idnas(rows, "blocked")
1913
1914     logger.debug("Success! - EXIT!")
1915     return 0
1916
1917 def remove_invalid(args: argparse.Namespace) -> int:
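    """
    Deletes all rows from the 'blocks' and 'instances' tables whose domain
    fails validation, commits the changes and vacuums the database. Returns
    0 on success.
    """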
1918     logger.debug("args[]='%s' - CALLED!", type(args))
1919
1920     logger.debug("Invoking locking.acquire() ...")
1921     locking.acquire()
1922
1923     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1924     rows = database.cursor.fetchall()
1925
1926     logger.info("Checking %d domains ...", len(rows))
1927     for row in rows:
1928         logger.debug("row[domain]='%s'", row["domain"])
1929         if not validators.domain(row["domain"].split("/")[0]):
1930             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1931             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1932             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1933
1934     logger.debug("Invoking commit() ...")
1935     database.connection.commit()
1936
1937     logger.info("Vacuum cleaning database ...")
1938     database.cursor.execute("VACUUM")
1939
1940     logger.debug("Success! - EXIT!")
1941     return 0