# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
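    """Checks whether args.domain can be added as a new instance: returns 0
    when unknown, 100 when invalid, 101 when blacklisted and 102 when it is
    already registered."""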
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
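    """Sanity-checks stored nodeinfo URLs: counts rows whose absolute
    nodeinfo_url matches neither the instance's domain nor its punycode
    form. Relative URLs always match. Always returns 0."""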
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
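    """Fetches the server list from the pixelfed.org API and registers all
    new, wanted domains."""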
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers yourself here
    headers = dict()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] is None or row["domain"] == "":
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
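    """Fetches a domain list from the gql.api.bka.li GraphQL API and adds
    all new, wanted domains as instances."""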
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.api.bka.li' returned error_message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] is None or entry["domain"] == "":
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
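    """Fetches and stores blocklists from registered instances, selected by
    --domain, --software, --force or an expired recheck interval, and tries
    to deobfuscate obfuscated entries."""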
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] is None or block["blocked"] == "":
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse DNS (.arpa) domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] is None or block["blocked"] == "":
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Instance wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
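    """Scrapes instance tables from fediverse.observer, optionally limited
    to one software type via --software, and registers new domains."""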
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain[%s]='%s'", type(domain), domain)
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
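    """Fetches the silenced/suspended server lists from wiki.todon.eu and
    records them as blocks for todon.eu."""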
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
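    """Fetches chaos.social's federation.md, parses the silenced/blocked
    tables and records the blocks for chaos.social."""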
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
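    """Fetches an FBA-specific RSS feed given as args.feed and registers
    all new, wanted domains found in its items."""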
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
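    """Fetches the FBA bot account's ATOM feed (ryona.agency by default,
    overridable via args.feed) and registers all linked, wanted domains."""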
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))

            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain is None or domain == "":
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
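    """Fetches peers from args.domain and, unless --single is given, also
    from known instances whose recheck interval has expired."""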
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 102

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
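    """Processes all configured CSV blocklists, optionally limited to a
    single blocker via --domain."""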
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
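    """Downloads oliphant's blocklists from codeberg.org and processes each
    CSV file, optionally limited to a single blocker via --domain."""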
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
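    """Fetches configured plain-text blocklists and processes every listed
    domain."""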
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain is None or domain == "":
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
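    """Scrapes the fedipact.online participant list and registers all new,
    wanted domains."""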
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching / from source_domain='%s' ...", source_domain)
    response = utils.fetch_url(
        f"https://{source_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1164     logger.debug("args[]='%s' - CALLED!", type(args))
1165
1166     logger.debug("Invoking locking.acquire() ...")
1167     locking.acquire()
1168
1169     source_domain = "instances.joinmobilizon.org"
1170     if sources.is_recent(source_domain):
1171         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1172         return 1
1173     else:
1174         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1175         sources.update(source_domain)
1176
1177     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1178     raw = utils.fetch_url(
1179         f"https://{source_domain}/api/v1/instances",
1180         network.web_headers,
1181         (config.get("connection_timeout"), config.get("read_timeout"))
1182     ).text
1183     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1184
1185     parsed = json.loads(raw)
1186     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1187
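    # The instances API response is assumed to wrap its rows in a "data" element,
    # e.g. (hypothetical values): {"total": 1, "data": [{"host": "mobilizon.example"}]}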
1188     if "data" not in parsed:
1189         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1190         return 1
1191
1192     logger.info("Checking %d instances ...", len(parsed["data"]))
1193     for row in parsed["data"]:
1194         logger.debug("row[]='%s'", type(row))
1195         if "host" not in row:
1196             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1197             continue
1198         elif not domain_helper.is_wanted(row["host"]):
1199             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1200             continue
1201         elif instances.is_registered(row["host"]):
1202             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1203             continue
1204
1205         logger.info("Fetching row[host]='%s' ...", row["host"])
1206         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1207
1208     logger.debug("Success! - EXIT!")
1209     return 0
1210
1211 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1212     logger.debug("args[]='%s' - CALLED!", type(args))
1213
1214     logger.debug("Invoking locking.acquire() ...")
1215     locking.acquire()
1216
1217     source_domain = "instanceapp.misskey.page"
1218     if sources.is_recent(source_domain):
1219         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1220         return 1
1221     else:
1222         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1223         sources.update(source_domain)
1224
1225     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1226     raw = utils.fetch_url(
1227         f"https://{source_domain}/instances.json",
1228         network.web_headers,
1229         (config.get("connection_timeout"), config.get("read_timeout"))
1230     ).text
1231     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1232
1233     parsed = json.loads(raw)
1234     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1235
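    # instances.json is assumed to carry its rows under "instancesInfos",
    # e.g. (hypothetical values): {"instancesInfos": [{"url": "misskey.example"}]}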
1236     if "instancesInfos" not in parsed:
1237         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1238         return 1
1239
1240     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1241     for row in parsed["instancesInfos"]:
1242         logger.debug("row[%s]='%s'", type(row), row)
1243         if "url" not in row:
1244             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1245             continue
1246         elif not domain_helper.is_wanted(row["url"]):
1247             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1248             continue
1249         elif instances.is_registered(row["url"]):
1250             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1251             continue
1252
1253         logger.info("Fetching row[url]='%s' ...", row["url"])
1254         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1255
1256     logger.debug("Success! - EXIT!")
1257     return 0
1258
1259 def recheck_obfuscation(args: argparse.Namespace) -> int:
1260     logger.debug("args[]='%s' - CALLED!", type(args))
1261
1262     logger.debug("Invoking locking.acquire() ...")
1263     locking.acquire()
1264
1265     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1266         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1267     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1268         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1269     else:
1270         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1271
1272     rows = database.cursor.fetchall()
1273     logger.info("Checking %d domains ...", len(rows))
1274     for row in rows:
1275         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1276         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1277             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1278             continue
1279
1280         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1281         blocking = federation.fetch_blocks(row["domain"])
1282
1283         logger.debug("blocking()=%d", len(blocking))
1284         if len(blocking) == 0:
1285             if row["software"] == "pleroma":
1286                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1287                 blocking = pleroma.fetch_blocks(row["domain"])
1288             elif row["software"] == "mastodon":
1289                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1290                 blocking = mastodon.fetch_blocks(row["domain"])
1291             elif row["software"] == "lemmy":
1292                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1293                 blocking = lemmy.fetch_blocks(row["domain"])
1294             elif row["software"] == "friendica":
1295                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1296                 blocking = friendica.fetch_blocks(row["domain"])
1297             elif row["software"] == "misskey":
1298                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1299                 blocking = misskey.fetch_blocks(row["domain"])
1300             else:
1301                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1302
1303         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1304         logger.debug("row[domain]='%s'", row["domain"])
1305         if row["domain"] != "chaos.social" and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1306             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1307             instances.set_last_blocked(row["domain"])
1308             instances.set_total_blocks(row["domain"], blocking)
1309
1310         obfuscated = 0
1311         blockdict = list()
1312
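        # Entries containing "*" or "?" below are treated as obfuscated and are
        # resolved via utils.deobfuscate(). A minimal sketch of the digest-based
        # idea, assuming a Mastodon-style SHA-256 digest accompanies the entry:
        #
        #     import hashlib
        #     def match_digest(digest, candidates):
        #         # Return the first known domain whose SHA-256 hex digest matches.
        #         for candidate in candidates:
        #             if hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest:
        #                 return candidate
        #         return None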
1313         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1314         for block in blocking:
1315             logger.debug("block[blocked]='%s'", block["blocked"])
1316             blocked = None
1317
1318             if block["blocked"] == "":
1319                 logger.debug("block[blocked] is empty - SKIPPED!")
1320                 continue
1321             elif block["blocked"].endswith(".onion"):
1322                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1323                 continue
1324             elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
1325                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1326                 continue
1327             elif block["blocked"].endswith(".arpa"):
1328                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1329                 continue
1330             elif block["blocked"].endswith(".tld"):
1331                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1332                 continue
1333             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1334                 logger.debug("block[blocked]='%s' is obfuscated.", block["blocked"])
1335                 obfuscated = obfuscated + 1
1336                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1337             elif not domain_helper.is_wanted(block["blocked"]):
1338                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1339                 continue
1340             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1341                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1342                 continue
1343
1344             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1345             if blocked is not None and blocked != block["blocked"]:
1346                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1347                 obfuscated = obfuscated - 1
1348
1349                 if blacklist.is_blacklisted(blocked):
1350                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1351                     continue
1352                 elif blacklist.is_blacklisted(row["domain"]):
1353                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1354                     continue
1355                 elif blocks.is_instance_blocked(row["domain"], blocked):
1356                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1357                     continue
1358
1359                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1360
1361                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1362                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1363                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1364                     blockdict.append({
1365                         "blocked": blocked,
1366                         "reason" : block["reason"],
1367                     })
1368
1369         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1370         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1371         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1372
1373         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1374         if instances.has_pending(row["domain"]):
1375             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1376             instances.update(row["domain"])
1377
1378         logger.debug("Invoking commit() ...")
1379         database.connection.commit()
1380
1381         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1382         if config.get("bot_enabled") and len(blockdict) > 0:
1383             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1384             network.send_bot_post(row["domain"], blockdict)
1385
1386     logger.debug("Success! - EXIT!")
1387     return 0
1388
1389 def fetch_fedilist(args: argparse.Namespace) -> int:
1390     logger.debug("args[]='%s' - CALLED!", type(args))
1391
1392     logger.debug("Invoking locking.acquire() ...")
1393     locking.acquire()
1394
1395     source_domain = "demo.fedilist.com"
1396     if sources.is_recent(source_domain):
1397         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1398         return 1
1399     else:
1400         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1401         sources.update(source_domain)
1402
1403     url = f"http://{source_domain}/instance/csv?onion=not"
1404     if args.software is not None and args.software != "":
1405         logger.debug("args.software='%s'", args.software)
1406         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1407
1408     logger.info("Fetching url='%s' ...", url)
1409     response = reqto.get(
1410         url,
1411         headers=network.web_headers,
1412         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1413         allow_redirects=False
1414     )
1415
1416     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1417     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1418         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1419         return 1
1420
1421     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1422
1423     logger.debug("reader[]='%s'", type(reader))
1424     if reader is None:
1425         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1426         return 2
1427
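    # The CSV is assumed to carry one instance per line with a "hostname" column,
    # e.g. (hypothetical): "hostname,software\nexample.social,mastodon\n";
    # csv.DictReader() then yields {"hostname": "example.social", "software": "mastodon"}.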
1428     rows = list(reader)
1429
1430     logger.info("Checking %d rows ...", len(rows))
1431     for row in rows:
1432         logger.debug("row[]='%s'", type(row))
1433         if "hostname" not in row:
1434             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1435             continue
1436
1437         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1438         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1439         logger.debug("domain='%s' - AFTER!", domain)
1440
1441         if domain is None or domain == "":
1442             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1443             continue
1444
1445         logger.debug("domain='%s' - BEFORE!", domain)
1446         domain = domain.encode("idna").decode("utf-8")
1447         logger.debug("domain='%s' - AFTER!", domain)
1448
1449         if not domain_helper.is_wanted(domain):
1450             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1451             continue
1452         elif (args.force is None or not args.force) and instances.is_registered(domain):
1453             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1454             continue
1455         elif instances.is_recent(domain):
1456             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1457             continue
1458
1459         logger.info("Fetching instances from domain='%s' ...", domain)
1460         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1461
1462     logger.debug("Success! - EXIT!")
1463     return 0
1464
1465 def update_nodeinfo(args: argparse.Namespace) -> int:
1466     logger.debug("args[]='%s' - CALLED!", type(args))
1467
1468     logger.debug("Invoking locking.acquire() ...")
1469     locking.acquire()
1470
1471     if args.domain is not None and args.domain != "":
1472         logger.debug("Fetching args.domain='%s'", args.domain)
1473         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1474     elif args.software is not None and args.software != "":
1475         logger.info("Fetching domains for args.software='%s'", args.software)
1476         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1477     elif args.mode is not None and args.mode != "":
1478         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1479         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1480     elif args.no_software:
1481         logger.info("Fetching domains with no software type detected ...")
1482         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1483     elif args.no_auto:
1484         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1485         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1486     elif args.no_detection:
1487         logger.info("Fetching domains with no detection mode being set ...")
1488         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1489     else:
1490         logger.info("Fetching all domains, least recently updated first ...")
1491         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1492
1493     domains = database.cursor.fetchall()
1494
1495     logger.info("Checking %d domain(s) ...", len(domains))
1496     cnt = 0
1497     for row in domains:
1498         logger.debug("row[]='%s'", type(row))
1499         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1500             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1501             continue
1502
1503         try:
1504             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1505             software = federation.determine_software(row["domain"])
1506
1507             logger.debug("Determined software='%s'", software)
1508             if (software != row["software"] and software is not None) or args.force is True:
1509                 logger.debug("software='%s'", software)
1510                 if software is None:
1511                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1512                     instances.set_nodeinfo_url(row["domain"], None)
1513
1514                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1515                 instances.set_software(row["domain"], software)
1516
1517             if software is not None:
1518                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1519                 instances.set_success(row["domain"])
1520         except network.exceptions as exception:
1521             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1522             instances.set_last_error(row["domain"], exception)
1523
1524         instances.set_last_nodeinfo(row["domain"])
1525         instances.update(row["domain"])
1526         cnt = cnt + 1
1527
1528     logger.debug("Success! - EXIT!")
1529     return 0
1530
1531 def fetch_instances_social(args: argparse.Namespace) -> int:
1532     logger.debug("args[]='%s' - CALLED!", type(args))
1533
1534     logger.debug("Invoking locking.acquire() ...")
1535     locking.acquire()
1536
1537     source_domain = "instances.social"
1538
1539     if config.get("instances_social_api_key") == "":
1540         logger.error("API key is not set. Please set it in your config.json file.")
1541         return 1
1542     elif sources.is_recent(source_domain):
1543         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1544         return 2
1545     else:
1546         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1547         sources.update(source_domain)
1548
1549     headers = {
1550         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1551     }
1552
1553     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1554     fetched = network.get_json_api(
1555         source_domain,
1556         "/api/1.0/instances/list?count=0&sort_by=name",
1557         headers=headers,
1558         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1559     )
1560     logger.debug("fetched[]='%s'", type(fetched))
1561
1562     if "error_message" in fetched:
1563         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1564         return 2
1565     elif "exception" in fetched:
1566         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1567         return 3
1568     elif "json" not in fetched:
1569         logger.warning("fetched has no element 'json' - EXIT!")
1570         return 4
1571     elif "instances" not in fetched["json"]:
1572         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1573         return 5
1574
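    # The list endpoint is assumed to return rows keyed by "name",
    # e.g. (hypothetical values): {"instances": [{"name": "example.social"}]}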
1575     domains = list()
1576     rows = fetched["json"]["instances"]
1577
1578     logger.info("Checking %d row(s) ...", len(rows))
1579     for row in rows:
1580         logger.debug("row[]='%s'", type(row))
1581         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1582         logger.debug("domain='%s' - AFTER!", domain)
1583
1584         if domain is None or domain == "":
1585             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1586             continue
1587
1588         logger.debug("domain='%s' - BEFORE!", domain)
1589         domain = domain.encode("idna").decode("utf-8")
1590         logger.debug("domain='%s' - AFTER!", domain)
1591
1592         if not domain_helper.is_wanted(domain):
1593             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1594             continue
1595         elif domain in domains:
1596             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1597             continue
1598         elif instances.is_registered(domain):
1599             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1600             continue
1601         elif instances.is_recent(domain):
1602             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1603             continue
1604
1605         logger.info("Fetching instances from domain='%s'", domain)
1606         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1607
1608     logger.debug("Success! - EXIT!")
1609     return 0
1610
1611 def fetch_relaylist(args: argparse.Namespace) -> int:
1612     logger.debug("args[]='%s' - CALLED!", type(args))
1613
1614     logger.debug("Invoking locking.acquire() ...")
1615     locking.acquire()
1616
1617     source_domain = "api.relaylist.com"
1618
1619     if sources.is_recent(source_domain):
1620         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1621         return 1
1622     else:
1623         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1624         sources.update(source_domain)
1625
1626     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1627     fetched = network.get_json_api(
1628         source_domain,
1629         "/relays",
1630         {},
1631         (config.get("connection_timeout"), config.get("read_timeout"))
1632     )
1633     logger.debug("fetched[]='%s'", type(fetched))
1634
1635     if "error_message" in fetched:
1636         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1637         return 2
1638     elif "exception" in fetched:
1639         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1640         return 3
1641     elif "json" not in fetched:
1642         logger.warning("fetched has no element 'json' - EXIT!")
1643         return 4
1644
1645     domains = list()
1646
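    # Each relay entry is assumed to carry its address under "url"; the bare host
    # is extracted with urlparse(), e.g.:
    #
    #     >>> urlparse("https://relay.example:443/inbox").netloc.lower().split(":")[0]
    #     'relay.example'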
1647     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1648     for row in fetched["json"]:
1649         logger.debug("row[]='%s'", type(row))
1650         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1651         logger.debug("domain='%s' - AFTER!", domain)
1652
1653         if domain is None or domain == "":
1654             logger.debug("domain='%s' is empty after parsing row[url]='%s' - SKIPPED!", domain, row["url"])
1655             continue
1656
1657         logger.debug("domain='%s' - BEFORE!", domain)
1658         domain = domain.encode("idna").decode("utf-8")
1659         logger.debug("domain='%s' - AFTER!", domain)
1660
1661         if not domain_helper.is_wanted(domain):
1662             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1663             continue
1664         elif domain in domains:
1665             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1666             continue
1667         elif instances.is_registered(domain):
1668             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1669             continue
1670         elif instances.is_recent(domain):
1671             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1672             continue
1673
1674         logger.info("Fetching instances from domain='%s'", domain)
1675         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1676
1677     logger.debug("Success! - EXIT!")
1678     return 0
1679
1680 def fetch_relays(args: argparse.Namespace) -> int:
1681     logger.debug("args[]='%s' - CALLED!", type(args))
1682
1683     logger.debug("Invoking locking.acquire() ...")
1684     locking.acquire()
1685
1686     if args.domain is not None and args.domain != "":
1687         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1688     elif args.software is not None and args.software != "":
1689         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1690     else:
1691         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1692
1693     domains = list()
1694     rows = database.cursor.fetchall()
1695
1696     logger.info("Checking %d relays ...", len(rows))
1697     for row in rows:
1698         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1699         peers = list()
1700         if not args.force and instances.is_recent(row["domain"]):
1701             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1702             continue
1703
1704         try:
1705             if row["software"] == "pub-relay":
1706                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1707                 raw = network.fetch_api_url(
1708                     row["nodeinfo_url"],
1709                     (config.get("connection_timeout"), config.get("read_timeout"))
1710                 )
1711
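                # The fetched nodeinfo document (raw["json"]) is assumed to expose
                # the relay's peers under metadata, e.g. (hypothetical values):
                #     {"metadata": {"peers": ["one.example", "two.example"]}}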
1712                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1713                 if "exception" in raw:
1714                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1715                     raise raw["exception"]
1716                 elif "error_message" in raw:
1717                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1718                     instances.set_last_error(row["domain"], raw)
1719                     instances.set_last_instance_fetch(row["domain"])
1720                     instances.update(row["domain"])
1721                     continue
1722                 elif "json" not in raw:
1723                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1724                     continue
1725                 elif "metadata" not in raw["json"]:
1726                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1727                     continue
1728                 elif "peers" not in raw["json"]["metadata"]:
1729                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1730                     continue
1731             else:
1732                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1733                 raw = utils.fetch_url(
1734                     f"https://{row['domain']}",
1735                     network.web_headers,
1736                     (config.get("connection_timeout"), config.get("read_timeout"))
1737                 ).text
1738                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1739
1740                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1741                 logger.debug("doc[]='%s'", type(doc))
1742
1743         except network.exceptions as exception:
1744             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1745             instances.set_last_error(row["domain"], exception)
1746             instances.set_last_instance_fetch(row["domain"])
1747             instances.update(row["domain"])
1748             continue
1749
1750         logger.debug("row[software]='%s'", row["software"])
1751         if row["software"] == "activityrelay":
1752             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1753             tags = doc.findAll("p")
1754
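            # activityrelay's front page is assumed to list peers as bare text nodes
            # inside the paragraph starting with "registered instances", e.g.
            # (hypothetical markup): <p>registered instances<br/>one.example<br/>two.example</p>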
1755             logger.debug("Checking %d paragraphs ...", len(tags))
1756             for tag in tags:
1757                 logger.debug("tag[]='%s'", type(tag))
1758                 if len(tag.contents) == 0:
1759                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1760                     continue
1761                 elif "registered instances" not in tag.contents[0]:
1762                     logger.debug("Skipping paragraph, text not found.")
1763                     continue
1764
1765                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1766                 for domain in tag.contents:
1767                     logger.debug("domain[%s]='%s'", type(domain), domain)
1768                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1769                         continue
1770
1771                     domain = str(domain)
1772                     logger.debug("domain='%s'", domain)
1773                     if not domain_helper.is_wanted(domain):
1774                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1775                         continue
1776
1777                     logger.debug("domain='%s' - BEFORE!", domain)
1778                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1779                     logger.debug("domain='%s' - AFTER!", domain)
1780
1781                     if domain is None or domain == "":
1782                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1783                         continue
1784                     elif domain not in peers:
1785                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1786                         peers.append(domain)
1787
1788                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1789                     if dict_helper.has_key(domains, "domain", domain):
1790                         logger.debug("domain='%s' already added", domain)
1791                         continue
1792
1793                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1794                     domains.append({
1795                         "domain": domain,
1796                         "origin": row["domain"],
1797                     })
1798         elif row["software"] in ["aoderelay", "selective-relay"]:
1799             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1800             if row["software"] == "aoderelay":
1801                 tags = doc.findAll("section", {"class": "instance"})
1802             else:
1803                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1804
1805             logger.debug("Checking %d tags ...", len(tags))
1806             for tag in tags:
1807                 logger.debug("tag[]='%s'", type(tag))
1808
1809                 link = tag.find("a")
1810                 logger.debug("link[%s]='%s'", type(link), link)
1811                 if not isinstance(link, bs4.element.Tag):
1812                     logger.warning("link[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(link), link)
1813                     continue
1814
1815                 components = urlparse(link.get("href"))
1816                 logger.debug("components(%d)='%s'", len(components), components)
1817                 domain = components.netloc.lower().split(":")[0]
1818
1819                 logger.debug("domain='%s' - BEFORE!", domain)
1820                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1821                 logger.debug("domain='%s' - AFTER!", domain)
1822
1823                 if domain is None or domain == "":
1824                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1825                     continue
1826                 elif domain not in peers:
1827                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1828                     peers.append(domain)
1829
1830                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1831                 if dict_helper.has_key(domains, "domain", domain):
1832                     logger.debug("domain='%s' already added", domain)
1833                     continue
1834
1835                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1836                 domains.append({
1837                     "domain": domain,
1838                     "origin": row["domain"],
1839                 })
1840         elif row["software"] == "pub-relay":
1841             logger.debug("Checking %d peer(s) from row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1842             for domain in raw["json"]["metadata"]["peers"]:
1843                 logger.debug("domain='%s' - BEFORE!", domain)
1844                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1845                 logger.debug("domain='%s' - AFTER!", domain)
1846
1847                 if domain is None or domain == "":
1848                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1849                     continue
1850                 elif domain not in peers:
1851                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1852                     peers.append(domain)
1853
1854                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1855                 if dict_helper.has_key(domains, "domain", domain):
1856                     logger.debug("domain='%s' already added", domain)
1857                     continue
1858
1859                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1860                 domains.append({
1861                     "domain": domain,
1862                     "origin": row["domain"],
1863                 })
1864         else:
1865             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1866             continue
1867
1868         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1869         instances.set_last_instance_fetch(row["domain"])
1870
1871         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1872         instances.set_total_peers(row["domain"], peers)
1873
1874         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1875         instances.update(row["domain"])
1876
1877     logger.info("Checking %d domains ...", len(domains))
1878     for row in domains:
1879         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1880         if not domain_helper.is_wanted(row["domain"]):
1881             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1882             continue
1883         elif instances.is_registered(row["domain"]):
1884             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1885             continue
1886
1887         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1888         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1889
1890     logger.debug("Success! - EXIT!")
1891     return 0
1892
1893 def convert_idna(args: argparse.Namespace) -> int:
1894     logger.debug("args[]='%s' - CALLED!", type(args))
1895
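    # Python's "idna" codec converts Unicode host names to their ASCII
    # (punycode) form, e.g.:
    #
    #     >>> "münchen.example".encode("idna").decode("utf-8")
    #     'xn--mnchen-3ya.example'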
1896     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1897     rows = database.cursor.fetchall()
1898
1899     logger.debug("rows[]='%s'", type(rows))
1900     instances.translate_idnas(rows, "domain")
1901
1902     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1903     rows = database.cursor.fetchall()
1904
1905     logger.debug("rows[]='%s'", type(rows))
1906     instances.translate_idnas(rows, "origin")
1907
1908     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1909     rows = database.cursor.fetchall()
1910
1911     logger.debug("rows[]='%s'", type(rows))
1912     blocks.translate_idnas(rows, "blocker")
1913
1914     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1915     rows = database.cursor.fetchall()
1916
1917     logger.debug("rows[]='%s'", type(rows))
1918     blocks.translate_idnas(rows, "blocked")
1919
1920     logger.debug("Success! - EXIT!")
1921     return 0
1922
1923 def remove_invalid(args: argparse.Namespace) -> int:
1924     logger.debug("args[]='%s' - CALLED!", type(args))
1925
1926     logger.debug("Invoking locking.acquire() ...")
1927     locking.acquire()
1928
1929     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1930     rows = database.cursor.fetchall()
1931
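    # validators.domain() accepts only bare DNS names, so anything else
    # (paths, spaces, stray URLs) is treated as invalid, e.g.:
    #
    #     >>> bool(validators.domain("example.social"))
    #     True
    #     >>> bool(validators.domain("not a domain"))
    #     False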
1932     logger.info("Checking %d domains ...", len(rows))
1933     for row in rows:
1934         logger.debug("row[domain]='%s'", row["domain"])
1935         if not validators.domain(row["domain"].split("/")[0]):
1936             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1937             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1938             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1939
1940     logger.debug("Invoking commit() ...")
1941     database.connection.commit()
1942
1943     logger.info("Vacuum cleaning database ...")
1944     database.cursor.execute("VACUUM")
1945
1946     logger.debug("Success! - EXIT!")
1947     return 0