# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
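    """
    Checks whether args.domain is valid, not blacklisted and not yet
    registered. Returns 0 when the domain is unknown, otherwise a status
    code between 100 and 102.
    """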
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
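    """
    Checks all registered instances with an absolute nodeinfo_url for URLs
    that contain neither the instance's domain nor its punycode form and
    logs any mismatches found.
    """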
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
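    """
    Fetches the server list from the pixelfed.org API and registers all new,
    wanted instances found in it.
    """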
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is used by default; network.source_headers does not need to be added manually here.
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
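    """
    Fetches a domain list from the gql.api.bka.li GraphQL API and registers
    all new, wanted instances found in it.
    """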
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
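    """
    Fetches and records blocklists from registered instances, either for a
    single domain (args.domain), a single software type (args.software) or
    for all supported instances, deobfuscating blocked domains where
    possible.
    """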
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a Tor .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
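    """
    Fetches instance lists per software type from fediverse.observer and
    registers all new, wanted instances found in them.
    """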
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        software = software_helper.alias(software)
        logger.debug("software='%s' - AFTER!", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
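    """
    Fetches the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks of todon.eu.
    """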
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
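    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com and
    records the silenced and blocked instances listed there.
    """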
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
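    """
    Fetches an FBA-specific RSS feed from args.feed and registers all new,
    wanted domains found in its items.
    """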
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
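    """
    Fetches the FBA bot account's ATOM feed (ryona.agency by default,
    args.feed if given) and registers all new, wanted domains linked in its
    entries.
    """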
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain is None or domain == "":
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
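    """
    Fetches instances from args.domain and, unless args.single is set,
    continues with all known instances whose last fetch is older than the
    configured recheck_instance interval.
    """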
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 102

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
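    """
    Processes all CSV-based blocklists configured in blocklists.csv_files,
    optionally limited to a single blocker via args.domain.
    """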
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given that does not match the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
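    """
    Fetches oliphant's CSV blocklists from codeberg.org and processes them,
    optionally limited to a single blocker via args.domain.
    """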
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given that does not match the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
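    """
    Fetches static plain-text blocklists (currently seirdy.one) and
    processes every wanted domain listed in them.
    """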
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain is None or domain == "":
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
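    """
    Fetches the list of pact signers from fedipact.online and registers all
    new, wanted instances found in it.
    """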
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1116         return 1
1117     else:
1118         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1119         sources.update(source_domain)
1120
1121     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1122     response = utils.fetch_url(
1123         f"https://{source_domain}",
1124         network.web_headers,
1125         (config.get("connection_timeout"), config.get("read_timeout"))
1126     )
1127
1128     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1129     if response.ok and response.status_code == 200 and response.text != "":
1130         logger.debug("Parsing %d Bytes ...", len(response.text))
1131
1132         doc = bs4.BeautifulSoup(response.text, "html.parser")
1133         logger.debug("doc[]='%s'", type(doc))
1134
1135         rows = doc.findAll("li")
1136         logger.info("Checking %d row(s) ...", len(rows))
1137         for row in rows:
1138             logger.debug("row[]='%s'", type(row))
1139             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1140
1141             logger.debug("domain='%s' - AFTER!", domain)
1142             if domain is None or domain == "":
1143                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1144                 continue
1145
1146             logger.debug("domain='%s' - BEFORE!", domain)
1147             domain = domain.encode("idna").decode("utf-8")
1148             logger.debug("domain='%s' - AFTER!", domain)
1149
1150             if not domain_helper.is_wanted(domain):
1151                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1152                 continue
1153             elif instances.is_registered(domain):
1154                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1155                 continue
1156             elif instances.is_recent(domain):
1157                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1158                 continue
1159
1160             logger.info("Fetching domain='%s' ...", domain)
1161             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1162
1163     logger.debug("Success! - EXIT!")
1164     return 0
1165
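# fetch_joinmobilizon: queries the public API of instances.joinmobilizon.org
# and crawls every wanted, not yet registered Mobilizon host from its
# "data" array.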
1166 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1167     logger.debug("args[]='%s' - CALLED!", type(args))
1168
1169     logger.debug("Invoking locking.acquire() ...")
1170     locking.acquire()
1171
1172     source_domain = "instances.joinmobilizon.org"
1173     if sources.is_recent(source_domain):
1174         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1175         return 1
1176     else:
1177         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1178         sources.update(source_domain)
1179
1180     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1181     raw = utils.fetch_url(
1182         f"https://{source_domain}/api/v1/instances",
1183         network.web_headers,
1184         (config.get("connection_timeout"), config.get("read_timeout"))
1185     ).text
1186     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1187
1188     parsed = json.loads(raw)
1189     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1190
1191     if "data" not in parsed:
1192         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1193         return 1
1194
1195     logger.info("Checking %d instances ...", len(parsed["data"]))
1196     for row in parsed["data"]:
1197         logger.debug("row[]='%s'", type(row))
1198         if "host" not in row:
1199             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1200             continue
1201         elif not domain_helper.is_wanted(row["host"]):
1202             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1203             continue
1204         elif instances.is_registered(row["host"]):
1205             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1206             continue
1207
1208         logger.info("Fetching row[host]='%s' ...", row["host"])
1209         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1210
1211     logger.debug("Success! - EXIT!")
1212     return 0
1213
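# fetch_joinmisskey: downloads instances.json from instanceapp.misskey.page
# and crawls every wanted, not yet registered Misskey host from its
# "instancesInfos" array.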
1214 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1215     logger.debug("args[]='%s' - CALLED!", type(args))
1216
1217     logger.debug("Invoking locking.acquire() ...")
1218     locking.acquire()
1219
1220     source_domain = "instanceapp.misskey.page"
1221     if sources.is_recent(source_domain):
1222         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1223         return 1
1224     else:
1225         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1226         sources.update(source_domain)
1227
1228     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1229     raw = utils.fetch_url(
1230         f"https://{source_domain}/instances.json",
1231         network.web_headers,
1232         (config.get("connection_timeout"), config.get("read_timeout"))
1233     ).text
1234     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1235
1236     parsed = json.loads(raw)
1237     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1238
1239     if "instancesInfos" not in parsed:
1240         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1241         return 1
1242
1243     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1244     for row in parsed["instancesInfos"]:
1245         logger.debug("row[%s]='%s'", type(row), row)
1246         if "url" not in row:
1247             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1248             continue
1249         elif not domain_helper.is_wanted(row["url"]):
1250             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1251             continue
1252         elif instances.is_registered(row["url"]):
1253             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1254             continue
1255
1256         logger.info("Fetching row[url]='%s' ...", row["url"])
1257         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1258
1259     logger.debug("Success! - EXIT!")
1260     return 0
1261
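# recheck_obfuscation: revisits instances flagged with has_obfuscation=1,
# optionally narrowed by --domain or --software. Block entries containing
# "*" or "?" (e.g. "exam*le.com") are counted as obfuscated and, where a
# digest is available, passed to utils.deobfuscate(); successfully resolved
# entries are recorded as regular blocks.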
1262 def recheck_obfuscation(args: argparse.Namespace) -> int:
1263     logger.debug("args[]='%s' - CALLED!", type(args))
1264
1265     logger.debug("Invoking locking.acquire() ...")
1266     locking.acquire()
1267
1268     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1269         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1270     elif isinstance(args.software, str) and args.software != "":
1271         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1272     else:
1273         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1274
1275     rows = database.cursor.fetchall()
1276     logger.info("Checking %d domains ...", len(rows))
1277     for row in rows:
1278         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1279         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1280             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1281             continue
1282
1283         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1284         blocking = federation.fetch_blocks(row["domain"])
1285
1286         logger.debug("blocking()=%d", len(blocking))
1287         if len(blocking) == 0:
1288             if row["software"] == "pleroma":
1289                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1290                 blocking = pleroma.fetch_blocks(row["domain"])
1291             elif row["software"] == "mastodon":
1292                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1293                 blocking = mastodon.fetch_blocks(row["domain"])
1294             elif row["software"] == "lemmy":
1295                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1296                 blocking = lemmy.fetch_blocks(row["domain"])
1297             elif row["software"] == "friendica":
1298                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1299                 blocking = friendica.fetch_blocks(row["domain"])
1300             elif row["software"] == "misskey":
1301                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1302                 blocking = misskey.fetch_blocks(row["domain"])
1303             else:
1304                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1305
1306         # chaos.social isn't part of oliphant's "hidden" blocklists
1307         logger.debug("row[domain]='%s'", row["domain"])
1308         if row["domain"] != "chaos.social" and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1309             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1310             instances.set_last_blocked(row["domain"])
1311             instances.set_total_blocks(row["domain"], blocking)
1312
1313         obfuscated = 0
1314         blockdict = list()
1315
1316         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1317         for block in blocking:
1318             logger.debug("block[blocked]='%s'", block["blocked"])
1319             blocked = None
1320
1321             if block["blocked"] == "":
1322                 logger.debug("block[blocked] is empty - SKIPPED!")
1323                 continue
1324             elif block["blocked"].endswith(".arpa"):
1325                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1326                 continue
1327             elif block["blocked"].endswith(".tld"):
1328                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1329                 continue
1330             elif block["blocked"].endswith(".onion"):
1331                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1332                 continue
1333             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1334                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1335                 obfuscated = obfuscated + 1
1336                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1337             elif not domain_helper.is_wanted(block["blocked"]):
1338                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1339                 continue
1340             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1341                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1342                 continue
1343
1344             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1345             if blocked is not None and blocked != block["blocked"]:
1346                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1347                 obfuscated = obfuscated - 1
1348
1349                 if blacklist.is_blacklisted(blocked):
1350                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1351                     continue
1352                 elif blacklist.is_blacklisted(row["domain"]):
1353                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1354                     continue
1355                 elif blocks.is_instance_blocked(row["domain"], blocked):
1356                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1357                     continue
1358
1359                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1360
1361                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1362                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1363                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1364                     blockdict.append({
1365                         "blocked": blocked,
1366                         "reason" : block["reason"],
1367                     })
1368
1369         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1370         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1371
1372         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1373         if instances.has_pending(row["domain"]):
1374             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1375             instances.update(row["domain"])
1376
1377         logger.debug("Invoking commit() ...")
1378         database.connection.commit()
1379
1380         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1381         if config.get("bot_enabled") and len(blockdict) > 0:
1382             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1383             network.send_bot_post(row["domain"], blockdict)
1384
1385     logger.debug("Success! - EXIT!")
1386     return 0
1387
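# fetch_fedilist: downloads a CSV instance list from demo.fedilist.com
# (optionally filtered by --software) and crawls every wanted host that is
# neither recently crawled nor, unless --force is given, already registered.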
1388 def fetch_fedilist(args: argparse.Namespace) -> int:
1389     logger.debug("args[]='%s' - CALLED!", type(args))
1390
1391     logger.debug("Invoking locking.acquire() ...")
1392     locking.acquire()
1393
1394     source_domain = "demo.fedilist.com"
1395     if sources.is_recent(source_domain):
1396         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1397         return 1
1398     else:
1399         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1400         sources.update(source_domain)
1401
1402     url = f"http://{source_domain}/instance/csv?onion=not"
1403     if args.software is not None and args.software != "":
1404         logger.debug("args.software='%s'", args.software)
1405         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1406
1407     logger.info("Fetching url='%s' ...", url)
1408     response = reqto.get(
1409         url,
1410         headers=network.web_headers,
1411         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1412         allow_redirects=False
1413     )
1414
1415     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1416     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1417         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1418         return 1
1419
1420     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1421
1422     logger.debug("reader[]='%s'", type(reader))
1423     if reader is None:
1424         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1425         return 2
1426
1427     rows = list(reader)
1428
1429     logger.info("Checking %d rows ...", len(rows))
1430     for row in rows:
1431         logger.debug("row[]='%s'", type(row))
1432         if "hostname" not in row:
1433             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1434             continue
1435
1436         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1437         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1438         logger.debug("domain='%s' - AFTER!", domain)
1439
1440         if domain is None or domain == "":
1441             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1442             continue
1443
1444         logger.debug("domain='%s' - BEFORE!", domain)
1445         domain = domain.encode("idna").decode("utf-8")
1446         logger.debug("domain='%s' - AFTER!", domain)
1447
1448         if not domain_helper.is_wanted(domain):
1449             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1450             continue
1451         elif (args.force is None or not args.force) and instances.is_registered(domain):
1452             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1453             continue
1454         elif instances.is_recent(domain):
1455             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1456             continue
1457
1458         logger.info("Fetching instances from domain='%s' ...", domain)
1459         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1460
1461     logger.debug("Success! - EXIT!")
1462     return 0
1463
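# update_nodeinfo: re-runs software detection for stored instances. Row
# selection depends on --domain, --software, --mode, --no-software,
# --no-auto or --no-detection; without any of these, all instances are
# processed, least recently updated first. The stored software type is
# replaced when detection yields a different, non-None result or --force
# is set.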
1464 def update_nodeinfo(args: argparse.Namespace) -> int:
1465     logger.debug("args[]='%s' - CALLED!", type(args))
1466
1467     logger.debug("Invoking locking.acquire() ...")
1468     locking.acquire()
1469
1470     if args.domain is not None and args.domain != "":
1471         logger.debug("Fetching args.domain='%s'", args.domain)
1472         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1473     elif args.software is not None and args.software != "":
1474         logger.info("Fetching domains for args.software='%s'", args.software)
1475         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1476     elif args.mode is not None and args.mode != "":
1477         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1478         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode.upper()])
1479     elif args.no_software:
1480         logger.info("Fetching domains with no software type detected ...")
1481         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1482     elif args.no_auto:
1483         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY ...")
1484         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1485     elif args.no_detection:
1486         logger.info("Fetching domains with no detection mode being set ...")
1487         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1488     else:
1489         logger.info("Fetching all domains, least recently updated first ...")
1490         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1491
1492     domains = database.cursor.fetchall()
1493
1494     logger.info("Checking %d domain(s) ...", len(domains))
1495     cnt = 0
1496     for row in domains:
1497         logger.debug("row[]='%s'", type(row))
1498         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1499             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1500             continue
1501
1502         try:
1503             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1504             software = federation.determine_software(row["domain"])
1505
1506             logger.debug("Determined software='%s'", software)
1507             if (software != row["software"] and software is not None) or args.force is True:
1508                 logger.debug("software='%s'", software)
1509                 if software is None:
1510                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1511                     instances.set_nodeinfo_url(row["domain"], None)
1512
1513                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1514                 instances.set_software(row["domain"], software)
1515
1516             if software is not None:
1517                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1518                 instances.set_success(row["domain"])
1519         except network.exceptions as exception:
1520             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1521             instances.set_last_error(row["domain"], exception)
1522
1523         instances.set_last_nodeinfo(row["domain"])
1524         instances.update(row["domain"])
1525         cnt = cnt + 1
1526
1527     logger.debug("Success! - EXIT!")
1528     return 0
1529
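# fetch_instances_social: queries the instances.social list API (requires
# instances_social_api_key in config.json) and crawls every wanted, not yet
# registered domain from the result.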
1530 def fetch_instances_social(args: argparse.Namespace) -> int:
1531     logger.debug("args[]='%s' - CALLED!", type(args))
1532
1533     logger.debug("Invoking locking.acquire() ...")
1534     locking.acquire()
1535
1536     source_domain = "instances.social"
1537
1538     if config.get("instances_social_api_key") == "":
1539         logger.error("API key not set. Please set in your config.json file.")
1540         return 1
1541     elif sources.is_recent(source_domain):
1542         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1543         return 2
1544     else:
1545         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1546         sources.update(source_domain)
1547
1548     headers = {
1549         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1550     }
1551
1552     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1553     fetched = network.get_json_api(
1554         source_domain,
1555         "/api/1.0/instances/list?count=0&sort_by=name",
1556         headers,
1557         (config.get("connection_timeout"), config.get("read_timeout"))
1558     )
1559     logger.debug("fetched[]='%s'", type(fetched))
1560
1561     if "error_message" in fetched:
1562         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1563         return 2
1564     elif "exception" in fetched:
1565         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1566         return 3
1567     elif "json" not in fetched:
1568         logger.warning("fetched has no element 'json' - EXIT!")
1569         return 4
1570     elif "instances" not in fetched["json"]:
1571         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1572         return 5
1573
1574     domains = list()
1575     rows = fetched["json"]["instances"]
1576
1577     logger.info("Checking %d row(s) ...", len(rows))
1578     for row in rows:
1579         logger.debug("row[]='%s'", type(row))
1580         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1581         logger.debug("domain='%s' - AFTER!", domain)
1582
1583         if domain is None or domain == "":
1584             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1585             continue
1586
1587         logger.debug("domain='%s' - BEFORE!", domain)
1588         domain = domain.encode("idna").decode("utf-8")
1589         logger.debug("domain='%s' - AFTER!", domain)
1590
1591         if not domain_helper.is_wanted(domain):
1592             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1593             continue
1594         elif domain in domains:
1595             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1596             continue
1597         elif instances.is_registered(domain):
1598             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1599             continue
1600         elif instances.is_recent(domain):
1601             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1602             continue
1603
1604         logger.info("Fetching instances from domain='%s'", domain)
1605         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1606
1607     logger.debug("Success! - EXIT!")
1608     return 0
1609
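# fetch_relays: harvests peer lists from known relay software. For
# "activityrelay" the peers are scraped from <p> tags on the landing page,
# for "aoderelay"/"selective-relay" from linked tags in the instance list,
# and for "pub-relay" from the "peers" array in the nodeinfo metadata. New
# domains are then crawled with the relay as origin.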
1610 def fetch_relays(args: argparse.Namespace) -> int:
1611     logger.debug("args[]='%s' - CALLED!", type(args))
1612
1613     logger.debug("Invoking locking.acquire() ...")
1614     locking.acquire()
1615
1616     if args.domain is not None and args.domain != "":
1617         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1618     elif args.software is not None and args.software != "":
1619         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1620     else:
1621         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1622
1623     domains = list()
1624     rows = database.cursor.fetchall()
1625
1626     logger.info("Checking %d relays ...", len(rows))
1627     for row in rows:
1628         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1629         peers = list()
1630         if not args.force and instances.is_recent(row["domain"]):
1631             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1632             continue
1633
1634         try:
1635             if row["software"] == "pub-relay":
1636                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1637                 raw = network.fetch_api_url(
1638                     row["nodeinfo_url"],
1639                     (config.get("connection_timeout"), config.get("read_timeout"))
1640                 )
1641
1642                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1643                 if "exception" in raw:
1644                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1645                     raise raw["exception"]
1646                 elif "error_message" in raw:
1647                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1648                     instances.set_last_error(row["domain"], raw)
1649                     instances.set_last_instance_fetch(row["domain"])
1650                     instances.update(row["domain"])
1651                     continue
1652                 elif "json" not in raw:
1653                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1654                     continue
1655                 elif "metadata" not in raw["json"]:
1656                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1657                     continue
1658                 elif "peers" not in raw["json"]["metadata"]:
1659                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1660                     continue
1661             else:
1662                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1663                 raw = utils.fetch_url(
1664                     f"https://{row['domain']}",
1665                     network.web_headers,
1666                     (config.get("connection_timeout"), config.get("read_timeout"))
1667                 ).text
1668                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1669
1670                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1671                 logger.debug("doc[]='%s'", type(doc))
1672
1673         except network.exceptions as exception:
1674             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1675             instances.set_last_error(row["domain"], exception)
1676             instances.set_last_instance_fetch(row["domain"])
1677             instances.update(row["domain"])
1678             continue
1679
1680         logger.debug("row[software]='%s'", row["software"])
1681         if row["software"] == "activityrelay":
1682             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1683             tags = doc.findAll("p")
1684
1685             logger.debug("Checking %d paragraphs ...", len(tags))
1686             for tag in tags:
1687                 logger.debug("tag[]='%s'", type(tag))
1688                 if len(tag.contents) == 0:
1689                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1690                     continue
1691                 elif "registered instances" not in tag.contents[0]:
1692                     logger.debug("Skipping paragraph, text not found.")
1693                     continue
1694
1695                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1696                 for domain in tag.contents:
1697                     logger.debug("domain[%s]='%s'", type(domain), domain)
1698                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1699                         continue
1700
1701                     domain = str(domain)
1702                     logger.debug("domain='%s'", domain)
1703                     if not domain_helper.is_wanted(domain):
1704                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1705                         continue
1706
1707                     logger.debug("domain='%s' - BEFORE!", domain)
1708                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1709                     logger.debug("domain='%s' - AFTER!", domain)
1710
1711                     if domain is None or domain == "":
1712                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1713                         continue
1714                     elif domain not in peers:
1715                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1716                         peers.append(domain)
1717
1718                     if dict_helper.has_key(domains, "domain", domain):
1719                         logger.debug("domain='%s' already added", domain)
1720                         continue
1721
1722                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1723                     domains.append({
1724                         "domain": domain,
1725                         "origin": row["domain"],
1726                     })
1727         elif row["software"] in ["aoderelay", "selective-relay"]:
1728             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1729             if row["software"] == "aoderelay":
1730                 tags = doc.findAll("section", {"class": "instance"})
1731             else:
1732                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1733
1734             logger.debug("Checking %d tags ...", len(tags))
1735             for tag in tags:
1736                 logger.debug("tag[]='%s'", type(tag))
1737
1738                 link = tag.find("a")
1739                 logger.debug("link[%s]='%s'", type(link), link)
1740                 if not isinstance(link, bs4.element.Tag):
1741                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1742                     continue
1743
1744                 components = urlparse(link.get("href"))
1745                 logger.debug("components(%d)='%s'", len(components), components)
1746                 domain = components.netloc.lower().split(":")[0]
1747
1748                 logger.debug("domain='%s' - BEFORE!", domain)
1749                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1750                 logger.debug("domain='%s' - AFTER!", domain)
1751
1752                 if domain is None or domain == "":
1753                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1754                     continue
1755                 elif domain not in peers:
1756                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1757                     peers.append(domain)
1758
1759                 if dict_helper.has_key(domains, "domain", domain):
1760                     logger.debug("domain='%s' already added", domain)
1761                     continue
1762
1763                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1764                 domains.append({
1765                     "domain": domain,
1766                     "origin": row["domain"],
1767                 })
1768         elif row["software"] == "pub-relay":
1769             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1770             for domain in raw["json"]["metadata"]["peers"]:
1771                 logger.debug("domain='%s' - BEFORE!", domain)
1772                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1773                 logger.debug("domain='%s' - AFTER!", domain)
1774
1775                 if domain is None or domain == "":
1776                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1777                     continue
1778                 elif domain not in peers:
1779                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1780                     peers.append(domain)
1781
1782                 if dict_helper.has_key(domains, "domain", domain):
1783                     logger.debug("domain='%s' already added", domain)
1784                     continue
1785
1786                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1787                 domains.append({
1788                     "domain": domain,
1789                     "origin": row["domain"],
1790                 })
1791         else:
1792             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1793             continue
1794
1795         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1796         instances.set_last_instance_fetch(row["domain"])
1797
1798         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1799         instances.set_total_peers(row["domain"], peers)
1800
1801         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1802         instances.update(row["domain"])
1803
1804     logger.info("Checking %d domains ...", len(domains))
1805     for row in domains:
1806         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1807         if not domain_helper.is_wanted(row["domain"]):
1808             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1809             continue
1810         elif instances.is_registered(row["domain"]):
1811             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1812             continue
1813
1814         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1815         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1816
1817     logger.debug("Success! - EXIT!")
1818     return 0
1819
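# convert_idna: one-shot maintenance command that rewrites all non-punycode
# domain columns (instances.domain/origin, blocks.blocker/blocked) to their
# IDNA form, e.g. "düsseldorf.social".encode("idna").decode("utf-8") yields
# "xn--dsseldorf-q9a.social".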
1820 def convert_idna(args: argparse.Namespace) -> int:
1821     logger.debug("args[]='%s' - CALLED!", type(args))
1822
1823     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1824     rows = database.cursor.fetchall()
1825
1826     logger.debug("rows[]='%s'", type(rows))
1827     instances.translate_idnas(rows, "domain")
1828
1829     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1830     rows = database.cursor.fetchall()
1831
1832     logger.debug("rows[]='%s'", type(rows))
1833     instances.translate_idnas(rows, "origin")
1834
1835     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1836     rows = database.cursor.fetchall()
1837
1838     logger.debug("rows[]='%s'", type(rows))
1839     blocks.translate_idnas(rows, "blocker")
1840
1841     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1842     rows = database.cursor.fetchall()
1843
1844     logger.debug("rows[]='%s'", type(rows))
1845     blocks.translate_idnas(rows, "blocked")
1846
1847     logger.debug("Success! - EXIT!")
1848     return 0
1849
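# remove_invalid: maintenance command that deletes instances whose domain
# fails validation, together with their block records, and vacuums the
# database afterwards.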
1850 def remove_invalid(args: argparse.Namespace) -> int:
1851     logger.debug("args[]='%s' - CALLED!", type(args))
1852
1853     logger.debug("Invoking locking.acquire() ...")
1854     locking.acquire()
1855
1856     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1857     rows = database.cursor.fetchall()
1858
1859     logger.info("Checking %d domains ...", len(rows))
1860     for row in rows:
1861         logger.debug("row[domain]='%s'", row["domain"])
1862         if not validators.domain(row["domain"].split("/")[0]):
1863             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1864             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1865             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1866
1867     logger.debug("Invoking commit() ...")
1868     database.connection.commit()
1869
1870     logger.info("Vacuuming database ...")
1871     database.cursor.execute("VACUUM")
1872
1873     logger.debug("Success! - EXIT!")
1874     return 0