fba/commands.py
# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
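    """
    Checks whether args.domain is valid, not blacklisted and not yet
    registered. Returns 0 when the domain is unknown, otherwise a
    non-zero status code (100-102).
    """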
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
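    """
    Checks all registered instances whether their stored nodeinfo_url
    matches their domain (or its punycode form). Always returns 0.
    """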
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
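    """
    Fetches the full server list from pixelfed.org's API and fetches
    instance data for every new, wanted domain found in it.
    """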
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is fetched by default, so network.source_headers doesn't need to be added here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 1

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] is None:
                logger.debug("row[domain] is None - SKIPPED!")
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
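    """
    Fetches a domain list from the GraphQL API at gql.api.bka.li and
    fetches instance data for every new, wanted domain.
    """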
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
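    """
    Fetches and stores blocklists from registered instances running
    supported software. The scope is a single instance (args.domain), a
    single software type (args.software), all instances (args.force) or
    only those due for a re-check.
    """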
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("blocked='%s' wasn't found, adding: blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
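    """
    Fetches instance lists per software type from fediverse.observer and
    fetches instance data for every new, wanted domain.
    """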
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
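    """
    Fetches the silenced/suspended server lists from wiki.todon.eu and
    records them as blocks for todon.eu.
    """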
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
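    """
    Fetches chaos.social's federation.md (Markdown) blocklist and records
    silenced/rejected instances for chaos.social.
    """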
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
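    """
    Fetches an FBA-specific RSS feed (args.feed) and adds all new, wanted
    domains found in its items.
    """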
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
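    """
    Fetches the FBA bot account's ATOM feed (default: ryona.agency, or
    args.feed if given) and adds all new, wanted domains linked in it.
    """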
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain is None or domain == "":
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
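    """
    Fetches instances starting from args.domain, then (unless args.single
    is set) loops over known instances whose last fetch is older than the
    configured recheck interval.
    """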
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 102

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
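    """
    Processes all configured CSV blocklists (blocklists.csv_files),
    optionally limited to a single blocker via args.domain.
    """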
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
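    """
    Downloads oliphant's blocklist CSV files from codeberg.org and
    processes them, optionally limited to a single blocker via args.domain.
    """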
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is domain given and not equal blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
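    """
    Fetches plain-text blocklists (blocklists.txt_files) and processes
    each listed domain.
    """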
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain is None or domain == "":
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
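    """
    Fetches the list of pledged instances from fedipact.online and adds
    all new, wanted domains.
    """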
1103     logger.debug("args[]='%s' - CALLED!", type(args))
1104
1105     logger.debug("Invoking locking.acquire() ...")
1106     locking.acquire()
1107
1108     source_domain = "fedipact.online"
1109     if sources.is_recent(source_domain):
1110         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1111         return 1
1112     else:
1113         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1114         sources.update(source_domain)
1115
1116     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1117     response = utils.fetch_url(
1118         f"https://{source_domain}",
1119         network.web_headers,
1120         (config.get("connection_timeout"), config.get("read_timeout"))
1121     )
1122
1123     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1124     if response.ok and response.status_code == 200 and response.text != "":
1125         logger.debug("Parsing %d Bytes ...", len(response.text))
1126
1127         doc = bs4.BeautifulSoup(response.text, "html.parser")
1128         logger.debug("doc[]='%s'", type(doc))
1129
1130         rows = doc.findAll("li")
1131         logger.info("Checking %d row(s) ...", len(rows))
1132         for row in rows:
1133             logger.debug("row[]='%s'", type(row))
1134             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1135
1136             logger.debug("domain='%s' - AFTER!", domain)
1137             if domain is None or domain == "":
1138                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1139                 continue
1140
1141             logger.debug("domain='%s' - BEFORE!", domain)
1142             domain = domain.encode("idna").decode("utf-8")
1143             logger.debug("domain='%s' - AFTER!", domain)
1144
1145             if not domain_helper.is_wanted(domain):
1146                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1147                 continue
1148             elif instances.is_registered(domain):
1149                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1150                 continue
1151             elif instances.is_recent(domain):
1152                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1153                 continue
1154
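                 # Every domain found on fedipact.online is recorded with the fixed
                 # origin "beach.city".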
1155             logger.info("Fetching domain='%s' ...", domain)
1156             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1157
1158     logger.debug("Success! - EXIT!")
1159     return 0
1160
1161 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1162     logger.debug("args[]='%s' - CALLED!", type(args))
1163
1164     logger.debug("Invoking locking.acquire() ...")
1165     locking.acquire()
1166
1167     source_domain = "instances.joinmobilizon.org"
1168     if sources.is_recent(source_domain):
1169         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1170         return 1
1171     else:
1172         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1173         sources.update(source_domain)
1174
1175     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1176     raw = utils.fetch_url(
1177         f"https://{source_domain}/api/v1/instances",
1178         network.web_headers,
1179         (config.get("connection_timeout"), config.get("read_timeout"))
1180     ).text
1181     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1182
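         # The API response is expected to be a JSON object whose "data" array
         # holds one entry per instance, each carrying a "host" key.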
1183     parsed = json.loads(raw)
1184     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1185
1186     if "data" not in parsed:
1187         logger.warning("parsed()=%d does not contain key 'data'")
1188         return 1
1189
1190     logger.info("Checking %d instances ...", len(parsed["data"]))
1191     for row in parsed["data"]:
1192         logger.debug("row[]='%s'", type(row))
1193         if "host" not in row:
1194             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1195             continue
1196         elif not domain_helper.is_wanted(row["host"]):
1197             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1198             continue
1199         elif instances.is_registered(row["host"]):
1200             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1201             continue
1202
1203         logger.info("Fetching row[host]='%s' ...", row["host"])
1204         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1205
1206     logger.debug("Success! - EXIT!")
1207     return 0
1208
1209 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1210     logger.debug("args[]='%s' - CALLED!", type(args))
1211
1212     logger.debug("Invoking locking.acquire() ...")
1213     locking.acquire()
1214
1215     source_domain = "instanceapp.misskey.page"
1216     if sources.is_recent(source_domain):
1217         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1218         return 1
1219     else:
1220         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1221         sources.update(source_domain)
1222
1223     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1224     raw = utils.fetch_url(
1225         f"https://{source_domain}/instances.json",
1226         network.web_headers,
1227         (config.get("connection_timeout"), config.get("read_timeout"))
1228     ).text
1229     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1230
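         # instances.json is expected to hold an "instancesInfos" array whose
         # entries carry the instance's hostname under "url".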
1231     parsed = json.loads(raw)
1232     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1233
1234     if "instancesInfos" not in parsed:
1235         logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1236         return 1
1237
1238     logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1239     for row in parsed["instancesInfos"]:
1240         logger.debug("row[%s]='%s'", type(row), row)
1241         if "url" not in row:
1242             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1243             continue
1244         elif not domain_helper.is_wanted(row["url"]):
1245             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1246             continue
1247         elif instances.is_registered(row["url"]):
1248             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1249             continue
1250
1251         logger.info("Fetching row[url]='%s' ...", row["url"])
1252         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1253
1254     logger.debug("Success! - EXIT!")
1255     return 0
1256
1257 def recheck_obfuscation(args: argparse.Namespace) -> int:
1258     logger.debug("args[]='%s' - CALLED!", type(args))
1259
1260     logger.debug("Invoking locking.acquire() ...")
1261     locking.acquire()
1262
1263     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1264         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1265     elif isinstance(args.software, str) and args.software != "":
1266         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1267     else:
1268         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1269
1270     rows = database.cursor.fetchall()
1271     logger.info("Checking %d domains ...", len(rows))
1272     for row in rows:
1273         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1274         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1275             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1276             continue
1277
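             # Try the generic block fetcher first; only fall back to the
             # software-specific fetchers below when it returns nothing.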
1278         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1279         blocking = federation.fetch_blocks(row["domain"])
1280
1281         logger.debug("blocking()=%d", len(blocking))
1282         if len(blocking) == 0:
1283             if row["software"] == "pleroma":
1284                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1285                 blocking = pleroma.fetch_blocks(row["domain"])
1286             elif row["software"] == "mastodon":
1287                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1288                 blocking = mastodon.fetch_blocks(row["domain"])
1289             elif row["software"] == "lemmy":
1290                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1291                 blocking = lemmy.fetch_blocks(row["domain"])
1292             elif row["software"] == "friendica":
1293                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1294                 blocking = friendica.fetch_blocks(row["domain"])
1295             elif row["software"] == "misskey":
1296                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1297                 blocking = misskey.fetch_blocks(row["domain"])
1298             else:
1299                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1300
1301         # c.s isn't part of oliphant's "hidden" blocklists
1302         logger.debug("row[domain]='%s'", row["domain"])
1303         if row["domain"] != "chaos.social" and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1304             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1305             instances.set_last_blocked(row["domain"])
1306             instances.set_total_blocks(row["domain"], blocking)
1307
1308         obfuscated = 0
1309         blockdict = list()
1310
1311         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1312         for block in blocking:
1313             logger.debug("block[blocked]='%s'", block["blocked"])
1314             blocked = None
1315
1316             if block["blocked"] == "":
1317                 logger.debug("block[blocked] is empty - SKIPPED!")
1318                 continue
1319             elif block["blocked"].endswith(".arpa"):
1320                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1321                 continue
1322             elif block["blocked"].endswith(".tld"):
1323                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1324                 continue
1325             elif block["blocked"].endswith(".onion"):
1326                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1327                 continue
1328             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1329                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1330                 obfuscated = obfuscated + 1
1331                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1332             elif not domain_helper.is_wanted(block["blocked"]):
1333                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1334                 continue
1335             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1336                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1337                 continue
1338
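                 # A successful deobfuscation yields a concrete domain differing
                 # from the masked entry; re-check it before recording the block.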
1339             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1340             if blocked is not None and blocked != block["blocked"]:
1341                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1342                 obfuscated = obfuscated - 1
1343
1344                 if blacklist.is_blacklisted(blocked):
1345                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1346                     continue
1347                 elif blacklist.is_blacklisted(row["domain"]):
1348                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1349                     continue
1350                 elif blocks.is_instance_blocked(row["domain"], blocked):
1351                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1352                     continue
1353
1354                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1355
1356                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1357                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1358                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1359                     blockdict.append({
1360                         "blocked": blocked,
1361                         "reason" : block["reason"],
1362                     })
1363
1364         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1365         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1366
1367         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1368         if instances.has_pending(row["domain"]):
1369             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1370             instances.update(row["domain"])
1371
1372         logger.debug("Invoking commit() ...")
1373         database.connection.commit()
1374
1375         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1376         if config.get("bot_enabled") and len(blockdict) > 0:
1377             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1378             network.send_bot_post(row["domain"], blockdict)
1379
1380     logger.debug("Success! - EXIT!")
1381     return 0
1382
1383 def fetch_fedilist(args: argparse.Namespace) -> int:
1384     logger.debug("args[]='%s' - CALLED!", type(args))
1385
1386     logger.debug("Invoking locking.acquire() ...")
1387     locking.acquire()
1388
1389     source_domain = "demo.fedilist.com"
1390     if sources.is_recent(source_domain):
1391         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1392         return 1
1393     else:
1394         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1395         sources.update(source_domain)
1396
1397     url = f"http://{source_domain}/instance/csv?onion=not"
1398     if args.software is not None and args.software != "":
1399         logger.debug("args.software='%s'", args.software)
1400         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1401
1402     logger.info("Fetching url='%s' ...", url)
1403     response = reqto.get(
1404         url,
1405         headers=network.web_headers,
1406         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1407         allow_redirects=False
1408     )
1409
1410     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1411     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1412         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1413         return 1
1414
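         # The CSV export is expected to contain (at least) a "hostname" column;
         # parse it using Python's "unix" dialect.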
1415     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1416
1417     logger.debug("reader[]='%s'", type(reader))
1418     if reader is None:
1419         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1420         return 2
1421
1422     rows = list(reader)
1423
1424     logger.info("Checking %d rows ...", len(rows))
1425     for row in rows:
1426         logger.debug("row[]='%s'", type(row))
1427         if "hostname" not in row:
1428             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1429             continue
1430
1431         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1432         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1433         logger.debug("domain='%s' - AFTER!", domain)
1434
1435         if domain is None or domain == "":
1436             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1437             continue
1438
1439         logger.debug("domain='%s' - BEFORE!", domain)
1440         domain = domain.encode("idna").decode("utf-8")
1441         logger.debug("domain='%s' - AFTER!", domain)
1442
1443         if not domain_helper.is_wanted(domain):
1444             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1445             continue
1446         elif (args.force is None or not args.force) and instances.is_registered(domain):
1447             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1448             continue
1449         elif instances.is_recent(domain):
1450             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1451             continue
1452
1453         logger.info("Fetching instances from domain='%s' ...", domain)
1454         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1455
1456     logger.debug("Success! - EXIT!")
1457     return 0
1458
1459 def update_nodeinfo(args: argparse.Namespace) -> int:
1460     logger.debug("args[]='%s' - CALLED!", type(args))
1461
1462     logger.debug("Invoking locking.acquire() ...")
1463     locking.acquire()
1464
1465     if args.domain is not None and args.domain != "":
1466         logger.debug("Fetching args.domain='%s'", args.domain)
1467         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1468     elif args.software is not None and args.software != "":
1469         logger.info("Fetching domains for args.software='%s'", args.software)
1470         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1471     elif args.mode is not None and args.mode != "":
1472         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1473         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1474     elif args.no_software:
1475         logger.info("Fetching domains with no software type detected ...")
1476         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1477     elif args.no_auto:
1478         logger.info("Fetching domains with other detection mode than AUTO_DISOVERY being set ...")
1479         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1480     elif args.no_detection:
1481         logger.info("Fetching domains with no detection mode being set ...")
1482         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1483     else:
1484         logger.info("Fetching domains for recently updated ...")
1485         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1486
1487     domains = database.cursor.fetchall()
1488
1489     logger.info("Checking %d domain(s) ...", len(domains))
1490     cnt = 0
1491     for row in domains:
1492         logger.debug("row[]='%s'", type(row))
1493         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1494             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1495             continue
1496
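             # Re-determine the software type; a change (or --force) updates the
             # stored record, and a None result also clears the nodeinfo URL.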
1497         try:
1498             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1499             software = federation.determine_software(row["domain"])
1500
1501             logger.debug("Determined software='%s'", software)
1502             if (software != row["software"] and software is not None) or args.force is True:
1503                 logger.debug("software='%s'", software)
1504                 if software is None:
1505                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1506                     instances.set_nodeinfo_url(row["domain"], None)
1507
1508                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1509                 instances.set_software(row["domain"], software)
1510
1511             if software is not None:
1512                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1513                 instances.set_success(row["domain"])
1514         except network.exceptions as exception:
1515             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1516             instances.set_last_error(row["domain"], exception)
1517
1518         instances.set_last_nodeinfo(row["domain"])
1519         instances.update(row["domain"])
1520         cnt = cnt + 1
1521
1522     logger.debug("Success! - EXIT!")
1523     return 0
1524
1525 def fetch_instances_social(args: argparse.Namespace) -> int:
1526     logger.debug("args[]='%s' - CALLED!", type(args))
1527
1528     logger.debug("Invoking locking.acquire() ...")
1529     locking.acquire()
1530
1531     source_domain = "instances.social"
1532
1533     if config.get("instances_social_api_key") == "":
1534         logger.error("API key not set. Please set in your config.json file.")
1535         return 1
1536     elif sources.is_recent(source_domain):
1537         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1538         return 2
1539     else:
1540         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1541         sources.update(source_domain)
1542
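         # instances.social requires an API token, configured as
         # "instances_social_api_key" in config.json.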
1543     headers = {
1544         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1545     }
1546
1547     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1548     fetched = network.get_json_api(
1549         source_domain,
1550         "/api/1.0/instances/list?count=0&sort_by=name",
1551         headers,
1552         (config.get("connection_timeout"), config.get("read_timeout"))
1553     )
1554     logger.debug("fetched[]='%s'", type(fetched))
1555
1556     if "error_message" in fetched:
1557         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1558         return 2
1559     elif "exception" in fetched:
1560         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1561         return 3
1562     elif "json" not in fetched:
1563         logger.warning("fetched has no element 'json' - EXIT!")
1564         return 4
1565     elif "instances" not in fetched["json"]:
1566         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1567         return 5
1568
1569     domains = list()
1570     rows = fetched["json"]["instances"]
1571
1572     logger.info("Checking %d row(s) ...", len(rows))
1573     for row in rows:
1574         logger.debug("row[]='%s'", type(row))
1575         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1576         logger.debug("domain='%s' - AFTER!", domain)
1577
1578         if domain is None or domain == "":
1579             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1580             continue
1581
1582         logger.debug("domain='%s' - BEFORE!", domain)
1583         domain = domain.encode("idna").decode("utf-8")
1584         logger.debug("domain='%s' - AFTER!", domain)
1585
1586         if not domain_helper.is_wanted(domain):
1587             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1588             continue
1589         elif domain in domains:
1590             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1591             continue
1592         elif instances.is_registered(domain):
1593             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1594             continue
1595         elif instances.is_recent(domain):
1596             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1597             continue
1598
1599         logger.info("Fetching instances from domain='%s'", domain)
1600         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1601
1602     logger.debug("Success! - EXIT!")
1603     return 0
1604
1605 def fetch_relaylist(args: argparse.Namespace) -> int:
1606     logger.debug("args[]='%s' - CALLED!", type(args))
1607
1608     logger.debug("Invoking locking.acquire() ...")
1609     locking.acquire()
1610
1611     source_domain = "api.relaylist.com"
1612
1613     if sources.is_recent(source_domain):
1614         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1615         return 1
1616     else:
1617         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1618         sources.update(source_domain)
1619
1620     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1621     fetched = network.get_json_api(
1622         source_domain,
1623         "/relays",
1624         {},
1625         (config.get("connection_timeout"), config.get("read_timeout"))
1626     )
1627     logger.debug("fetched[]='%s'", type(fetched))
1628
1629     if "error_message" in fetched:
1630         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1631         return 2
1632     elif "exception" in fetched:
1633         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1634         return 3
1635     elif "json" not in fetched:
1636         logger.warning("fetched has no element 'json' - EXIT!")
1637         return 4
1638
1639     domains = list()
1640
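         # Each entry in the returned JSON array describes one relay; its
         # hostname is taken from the entry's "url" field.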
1641     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1642     for row in fetched["json"]:
1643         logger.debug("row[]='%s'", type(row))
1644         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1645         logger.debug("domain='%s' - AFTER!", domain)
1646
1647         if domain is None or domain == "":
1648             logger.debug("domain='%s' is empty after URL parsing - SKIPPED!", domain)
1649             continue
1650
1651         logger.debug("domain='%s' - BEFORE!", domain)
1652         domain = domain.encode("idna").decode("utf-8")
1653         logger.debug("domain='%s' - AFTER!", domain)
1654
1655         if not domain_helper.is_wanted(domain):
1656             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1657             continue
1658         elif domain in domains:
1659             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1660             continue
1661         elif instances.is_registered(domain):
1662             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1663             continue
1664         elif instances.is_recent(domain):
1665             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1666             continue
1667
1668         logger.info("Fetching instances from domain='%s'", domain)
1669         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1670
1671     logger.debug("Success! - EXIT!")
1672     return 0
1673
1674 def fetch_relays(args: argparse.Namespace) -> int:
1675     logger.debug("args[]='%s' - CALLED!", type(args))
1676
1677     logger.debug("Invoking locking.acquire() ...")
1678     locking.acquire()
1679
1680     if args.domain is not None and args.domain != "":
1681         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1682     elif args.software is not None and args.software != "":
1683         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1684     else:
1685         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1686
1687     domains = list()
1688     rows = database.cursor.fetchall()
1689
1690     logger.info("Checking %d relays ...", len(rows))
1691     for row in rows:
1692         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1693         peers = list()
1694         if not args.force and instances.is_recent(row["domain"]):
1695             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1696             continue
1697
1698         try:
1699             if row["software"] == "pub-relay":
1700                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1701                 raw = network.fetch_api_url(
1702                     row["nodeinfo_url"],
1703                     (config.get("connection_timeout"), config.get("read_timeout"))
1704                 )
1705
1706                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1707                 if "exception" in raw:
1708                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1709                     raise raw["exception"]
1710                 elif "error_message" in raw:
1711                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1712                     instances.set_last_error(row["domain"], raw)
1713                     instances.set_last_instance_fetch(row["domain"])
1714                     instances.update(row["domain"])
1715                     continue
1716                 elif "json" not in raw:
1717                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1718                     continue
1719                 elif not "metadata" in raw["json"]:
1720                     logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
1721                     continue
1722                 elif not "peers" in raw["json"]["metadata"]:
1723                     logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
1724                     continue
1725             else:
1726                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1727                 raw = utils.fetch_url(
1728                     f"https://{row['domain']}",
1729                     network.web_headers,
1730                     (config.get("connection_timeout"), config.get("read_timeout"))
1731                 ).text
1732                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1733
1734                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1735                 logger.debug("doc[]='%s'", type(doc))
1736
1737         except network.exceptions as exception:
1738             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1739             instances.set_last_error(row["domain"], exception)
1740             instances.set_last_instance_fetch(row["domain"])
1741             instances.update(row["domain"])
1742             continue
1743
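             # Each relay software publishes its peers differently: activityrelay
             # lists them as plain-text paragraphs, aoderelay/selective-relay as
             # HTML sections/list items, and pub-relay via nodeinfo metadata.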
1744         logger.debug("row[software]='%s'", row["software"])
1745         if row["software"] == "activityrelay":
1746             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1747             tags = doc.findAll("p")
1748
1749             logger.debug("Checking %d paragraphs ...", len(tags))
1750             for tag in tags:
1751                 logger.debug("tag[]='%s'", type(tag))
1752                 if len(tag.contents) == 0:
1753                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1754                     continue
1755                 elif "registered instances" not in tag.contents[0]:
1756                     logger.debug("Skipping paragraph, text not found.")
1757                     continue
1758
1759                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1760                 for domain in tag.contents:
1761                     logger.debug("domain[%s]='%s'", type(domain), domain)
1762                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1763                         continue
1764
1765                     domain = str(domain)
1766                     logger.debug("domain='%s'", domain)
1767                     if not domain_helper.is_wanted(domain):
1768                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1769                         continue
1770
1771                     logger.debug("domain='%s' - BEFORE!", domain)
1772                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1773                     logger.debug("domain='%s' - AFTER!", domain)
1774
1775                     if domain is None or domain == "":
1776                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1777                         continue
1778                     elif domain not in peers:
1779                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1780                         peers.append(domain)
1781
1782                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1783                     if dict_helper.has_key(domains, "domain", domain):
1784                         logger.debug("domain='%s' already added", domain)
1785                         continue
1786
1787                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1788                     domains.append({
1789                         "domain": domain,
1790                         "origin": row["domain"],
1791                     })
1792         elif row["software"] in ["aoderelay", "selective-relay"]:
1793             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1794             if row["software"] == "aoderelay":
1795                 tags = doc.findAll("section", {"class": "instance"})
1796             else:
1797                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1798
1799             logger.debug("Checking %d tags ...", len(tags))
1800             for tag in tags:
1801                 logger.debug("tag[]='%s'", type(tag))
1802
1803                 link = tag.find("a")
1804                 logger.debug("link[%s]='%s'", type(link), link)
1805                 if not isinstance(link, bs4.element.Tag):
1806                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1807                     continue
1808
1809                 components = urlparse(link.get("href"))
1810                 logger.debug("components(%d)='%s'", len(components), components)
1811                 domain = components.netloc.lower().split(":")[0]
1812
1813                 logger.debug("domain='%s' - BEFORE!", domain)
1814                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1815                 logger.debug("domain='%s' - AFTER!", domain)
1816
1817                 if domain is None or domain == "":
1818                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1819                     continue
1820                 elif domain not in peers:
1821                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1822                     peers.append(domain)
1823
1824                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1825                 if dict_helper.has_key(domains, "domain", domain):
1826                     logger.debug("domain='%s' already added", domain)
1827                     continue
1828
1829                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1830                 domains.append({
1831                     "domain": domain,
1832                     "origin": row["domain"],
1833                 })
1834         elif row["software"] == "pub-relay":
1835             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1836             for domain in raw["json"]["metadata"]["peers"]:
1837                 logger.debug("domain='%s' - BEFORE!", domain)
1838                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1839                 logger.debug("domain='%s' - AFTER!", domain)
1840
1841                 if domain is None or domain == "":
1842                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1843                     continue
1844                 elif domain not in peers:
1845                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1846                     peers.append(domain)
1847
1848                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1849                 if dict_helper.has_key(domains, "domain", domain):
1850                     logger.debug("domain='%s' already added", domain)
1851                     continue
1852
1853                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1854                 domains.append({
1855                     "domain": domain,
1856                     "origin": row["domain"],
1857                 })
1858         else:
1859             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1860             continue
1861
1862         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1863         instances.set_last_instance_fetch(row["domain"])
1864
1865         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1866         instances.set_total_peers(row["domain"], peers)
1867
1868         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1869         instances.update(row["domain"])
1870
1871     logger.info("Checking %d domains ...", len(domains))
1872     for row in domains:
1873         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1874         if not domain_helper.is_wanted(row["domain"]):
1875             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1876             continue
1877         elif instances.is_registered(row["domain"]):
1878             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1879             continue
1880
1881         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1882         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1883
1884     logger.debug("Success! - EXIT!")
1885     return 0
1886
1887 def convert_idna(args: argparse.Namespace) -> int:
1888     logger.debug("args[]='%s' - CALLED!", type(args))
1889
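         # Convert all stored domain names that are not yet punycode ("xn--")
         # into their IDNA/ASCII form, one table column at a time.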
1890     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1891     rows = database.cursor.fetchall()
1892
1893     logger.debug("rows[]='%s'", type(rows))
1894     instances.translate_idnas(rows, "domain")
1895
1896     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1897     rows = database.cursor.fetchall()
1898
1899     logger.debug("rows[]='%s'", type(rows))
1900     instances.translate_idnas(rows, "origin")
1901
1902     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1903     rows = database.cursor.fetchall()
1904
1905     logger.debug("rows[]='%s'", type(rows))
1906     blocks.translate_idnas(rows, "blocker")
1907
1908     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1909     rows = database.cursor.fetchall()
1910
1911     logger.debug("rows[]='%s'", type(rows))
1912     blocks.translate_idnas(rows, "blocked")
1913
1914     logger.debug("Success! - EXIT!")
1915     return 0
1916
1917 def remove_invalid(args: argparse.Namespace) -> int:
1918     logger.debug("args[]='%s' - CALLED!", type(args))
1919
1920     logger.debug("Invoking locking.acquire() ...")
1921     locking.acquire()
1922
1923     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1924     rows = database.cursor.fetchall()
1925
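         # A row is invalid when the part before any "/" is not a syntactically
         # valid domain name; such rows and their block entries are removed.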
1926     logger.info("Checking %d domains ...", len(rows))
1927     for row in rows:
1928         logger.debug("row[domain]='%s'", row["domain"])
1929         if not validators.domain(row["domain"].split("/")[0]):
1930             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1931             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1932             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1933
1934     logger.debug("Invoking commit() ...")
1935     database.connection.commit()
1936
1937     logger.info("Vaccum cleaning database ...")
1938     database.cursor.execute("VACUUM")
1939
1940     logger.debug("Success! - EXIT!")
1941     return 0