# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
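    """Validates a single domain given as args.domain: it must be a valid
    domain name, not blacklisted and not already registered. Returns 0 when
    the domain is unknown, otherwise a distinct status code (100-102)."""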
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
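    """Walks over all instances with a stored nodeinfo_url and warns about
    every absolute URL that contains neither the instance's domain nor its
    punycode representation. Only reports, never modifies any record."""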
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
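    """Fetches the server list from pixelfed.org's public API and registers
    every new, wanted and not recently crawled domain through
    federation.fetch_instances()."""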
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers by yourself here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch from pixelfed.org API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
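    """Queries the GraphQL endpoint at gql.api.bka.li for its list of known
    domains and registers all new, wanted instances from it."""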
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error message='%s' - EXIT!", source_domain, fetched["error_message"])
            return 100
        elif "json" not in fetched:
            logger.warning("post_json_api() returned fetched[]='%s' with missing 'json' element - EXIT!", type(fetched))
            return 101
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s' - EXIT!", fetched["json"]["error"]["message"])
            return 102

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has recently been crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
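    """Fetches block lists from registered instances (all of them, a single
    one given via --domain, or one software type via --software), attempts
    to deobfuscate obfuscated entries, stores the resulting blocks and
    optionally announces new "reject"/"suspend" blocks via the bot account."""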
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_blocked ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks IS NULL
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'piefed') AND nodeinfo_url IS NOT NULL AND total_blocks IS NULL ORDER BY last_blocked ASC, total_blocks DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'piefed') AND nodeinfo_url IS NOT NULL ORDER BY last_blocked ASC, total_blocks DESC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has recently been crawled - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("block[blocked]='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] in ["reject", "suspend"] and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
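    """Crawls fediverse.observer: determines the list of software types from
    the site's navigation bar (or uses args.software) and fetches all known
    domains per software type through the site's GraphQL API."""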
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = network.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif "data" not in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif "nodes" not in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))

        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if "domain" not in item:
                logger.debug("item()=%d has no element 'domain' - SKIPPED!", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain[%s]='%s' is empty after tidyup.domain(): item[domain]='%s' - SKIPPED!", type(domain), domain, item["domain"])
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
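    """Parses the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks of todon.eu."""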
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue
            elif not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
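    """Downloads chaos.social's federation.md, renders it from Markdown,
    extracts the silenced and blocked instance tables and records them as
    blocks of chaos.social."""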
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
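    """Parses an FBA-specific RSS feed given via args.feed and registers all
    new, wanted domains found in the feed items."""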
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
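    """Parses the ATOM feed of an FBA bot account (ryona.agency by default,
    overridable via args.feed) and registers all new, wanted domains linked
    from the feed entries."""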
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain[%s]='%s' is empty after tidyup.domain(): href='%s' - SKIPPED!", type(domain), domain, href)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
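    """Fetches peer lists: first for the record(s) selected via args.domain
    or args.software, then for all instances of supported software types,
    ordered by total peers and response time."""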
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Init variables
    rows = list()

    # Is domain or software set?
    if args.domain not in [None, ""]:
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101

        logger.debug("args.domain='%s' - BEFORE!", args.domain)
        domain = tidyup.domain(args.domain)
        logger.debug("domain='%s' - AFTER!", domain)

        # Fetch record
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])
        rows = database.cursor.fetchall()
    elif args.software not in [None, ""]:
        logger.debug("args.software='%s' - BEFORE!", args.software)
        software = software_helper.alias(args.software)
        logger.debug("software='%s' - AFTER!", software)

        # Fetch records
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE software = ? ORDER BY last_instance_fetch ASC", [software])
        rows = database.cursor.fetchall()

    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
        if row["software"] is None and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
            continue
        elif software_helper.is_relay(row["software"]) and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
            continue
        elif not args.force and args.software not in [None, ""] and instances.is_recent(row["domain"]):
            logger.debug("row[domain]='%s' has recently been crawled - SKIPPED!", row["domain"])
            continue

        # Initial fetch
        try:
            logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)
            instances.update(row["domain"])
            continue

        if args.single:
            logger.debug("Not fetching more instances - BREAK!")
            break

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software \
FROM instances \
WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen', 'vebinet', 'hugo', 'toki') \
ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC"
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
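    """Processes all configured CSV block lists (blocklists.csv_files),
    optionally restricted to a single blocker via args.domain."""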
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
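    """Downloads oliphant's block lists from codeberg.org and processes each
    CSV file, optionally restricted to a single blocker via args.domain."""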
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
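    """Fetches plain-text block lists (blocklists.txt_files), one domain per
    line, and processes every wanted, not yet registered domain."""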
1110     logger.debug("args[]='%s' - CALLED!", type(args))
1111
1112     logger.debug("Invoking locking.acquire() ...")
1113     locking.acquire()
1114
1115     logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
1116     for row in blocklists.txt_files:
1117         logger.debug("Fetching row[url]='%s' ...", row["url"])
1118         response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1119
1120         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1121         if response.ok and response.status_code == 200 and response.text != "":
1122             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1123             domains = response.text.strip().split("\n")
1124
1125             logger.info("Processing %d domains ...", len(domains))
1126             for domain in domains:
1127                 logger.debug("domain='%s' - BEFORE!", domain)
1128                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1129                 logger.debug("domain='%s' - AFTER!", domain)
1130
1131                 if domain in [None, ""]:
1132                     logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
1133                     continue
1134                 elif not domain_helper.is_wanted(domain):
1135                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1136                     continue
1137                 elif not args.force and instances.is_registered(domain):
1138                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1139                     continue
1140
1141                 logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
1142                 processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
1143                 logger.debug("processed='%s'", processed)
1144
1145     logger.debug("Success! - EXIT!")
1146     return 0
1147
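# Scrapes the list of pledged instances from fedipact.online's HTML front page
# (one <li> element per instance) and crawls every wanted domain that is
# neither registered nor recently crawled.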
1148 def fetch_fedipact(args: argparse.Namespace) -> int:
1149     logger.debug("args[]='%s' - CALLED!", type(args))
1150
1151     logger.debug("Invoking locking.acquire() ...")
1152     locking.acquire()
1153
1154     source_domain = "fedipact.online"
1155     if sources.is_recent(source_domain):
1156         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1157         return 1
1158     else:
1159         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1160         sources.update(source_domain)
1161
1162     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1163     response = network.fetch_url(
1164         f"https://{source_domain}",
1165         network.web_headers,
1166         (config.get("connection_timeout"), config.get("read_timeout"))
1167     )
1168
1169     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1170     if response.ok and response.status_code == 200 and response.text != "":
1171         logger.debug("Parsing %d Bytes ...", len(response.text))
1172
1173         doc = bs4.BeautifulSoup(response.text, "html.parser")
1174         logger.debug("doc[]='%s'", type(doc))
1175
1176         rows = doc.findAll("li")
1177         logger.info("Checking %d row(s) ...", len(rows))
1178         for row in rows:
1179             logger.debug("row[]='%s'", type(row))
1180             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1181
1182             logger.debug("domain='%s' - AFTER!", domain)
1183             if domain in [None, ""]:
1184                 logger.debug("domain[%s]='%s' is empty after tidyup.domain(): row.contents[0]='%s' - SKIPPED!", type(domain), domain, row.contents[0])
1185                 continue
1186
1187             logger.debug("domain='%s' - BEFORE!", domain)
1188             domain = domain.encode("idna").decode("utf-8")
1189             logger.debug("domain='%s' - AFTER!", domain)
1190
1191             if not domain_helper.is_wanted(domain):
1192                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1193                 continue
1194             elif instances.is_registered(domain):
1195                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1196                 continue
1197             elif instances.is_recent(domain):
1198                 logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1199                 continue
1200
1201             logger.info("Fetching domain='%s' ...", domain)
1202             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1203
1204     logger.debug("Success! - EXIT!")
1205     return 0
1206
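# Fetches the public Mobilizon instance directory from
# instances.joinmobilizon.org (/api/v1/instances) and crawls each newly
# discovered host.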
1207 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1208     logger.debug("args[]='%s' - CALLED!", type(args))
1209
1210     logger.debug("Invoking locking.acquire() ...")
1211     locking.acquire()
1212
1213     source_domain = "instances.joinmobilizon.org"
1214     if sources.is_recent(source_domain):
1215         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1216         return 1
1217     else:
1218         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1219         sources.update(source_domain)
1220
1221     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1222     raw = network.fetch_url(
1223         f"https://{source_domain}/api/v1/instances",
1224         network.web_headers,
1225         (config.get("connection_timeout"), config.get("read_timeout"))
1226     ).text
1227     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1228
1229     parsed = json.loads(raw)
1230     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1231
1232     if "data" not in parsed:
1233         logger.warning("parsed()=%d does not contain key 'data' - EXIT!", len(parsed))
1234         return 1
1235
1236     logger.info("Checking %d instances ...", len(parsed["data"]))
1237     for row in parsed["data"]:
1238         logger.debug("row[]='%s'", type(row))
1239         if "host" not in row:
1240             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1241             continue
1242         elif not domain_helper.is_wanted(row["host"]):
1243             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1244             continue
1245         elif instances.is_registered(row["host"]):
1246             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1247             continue
1248         elif instances.is_recent(row["host"]):
1249             logger.debug("row[host]='%s' has recently been crawled - SKIPPED!", row["host"])
1250             continue
1251
1252         logger.info("Fetching row[host]='%s' ...", row["host"])
1253         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1254
1255     logger.debug("Success! - EXIT!")
1256     return 0
1257
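# Fetches instances.json from instanceapp.misskey.page and crawls each newly
# discovered instance listed under 'instancesInfos'.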
1258 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1259     logger.debug("args[]='%s' - CALLED!", type(args))
1260
1261     logger.debug("Invoking locking.acquire() ...")
1262     locking.acquire()
1263
1264     source_domain = "instanceapp.misskey.page"
1265     if sources.is_recent(source_domain):
1266         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1267         return 1
1268     else:
1269         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1270         sources.update(source_domain)
1271
1272     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1273     raw = network.fetch_url(
1274         f"https://{source_domain}/instances.json",
1275         network.web_headers,
1276         (config.get("connection_timeout"), config.get("read_timeout"))
1277     ).text
1278     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1279
1280     parsed = json.loads(raw)
1281     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1282
1283     if "instancesInfos" not in parsed:
1284         logger.warning("parsed()=%d does not contain element 'instancesInfos' - EXIT!", len(parsed))
1285         return 1
1286
1287     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1288     for row in parsed["instancesInfos"]:
1289         logger.debug("row[%s]='%s'", type(row), row)
1290         if "url" not in row:
1291             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1292             continue
1293         elif not domain_helper.is_wanted(row["url"]):
1294             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1295             continue
1296         elif instances.is_registered(row["url"]):
1297             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1298             continue
1299         elif instances.is_recent(row["url"]):
1300             logger.debug("row[url]='%s' has recently been crawled - SKIPPED!", row["url"])
1301             continue
1302
1303         logger.info("Fetching row[url]='%s' ...", row["url"])
1304         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1305
1306     logger.debug("Success! - EXIT!")
1307     return 0
1308
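# Re-checks instances that have (or may have) obfuscated block entries, i.e.
# entries containing "*" or "?" wildcards. Each such entry is passed through
# utils.deobfuscate() and, when resolved, recorded as a regular block. The
# scope can be narrowed to a single domain (args.domain) or to one software
# type (args.software).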
1309 def recheck_obfuscation(args: argparse.Namespace) -> int:
1310     logger.debug("args[]='%s' - CALLED!", type(args))
1311
1312     logger.debug("Invoking locking.acquire() ...")
1313     locking.acquire()
1314
1315     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1316         logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1317         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1318     elif isinstance(args.software, str) and args.software != "":
1319         logger.debug("Fetching records for args.software='%s' ...", args.software)
1320         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1321     else:
1322         logger.debug("Fetching records where domains have obfuscated block entries ...")
1323         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1324
1325     rows = database.cursor.fetchall()
1326     logger.info("Checking %d domains ...", len(rows))
1327     for row in rows:
1328         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1329         if not domain_helper.is_wanted(row["domain"]):
1330             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1331             if args.delete_unwanted:
1332                 logger.info("Deleting unwanted row[domain]='%s' ...", row["domain"])
1333                 instances.delete(row["domain"])
1334                 blocks.delete(row["domain"])
1335             continue
1336         elif blacklist.is_blacklisted(row["domain"]):
1337             logger.warning("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1338             continue
1339         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1340             logger.debug("row[domain]='%s' has recently been checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1341             continue
1342
1343         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1344         blocking = federation.fetch_blocks(row["domain"])
1345
1346         logger.debug("blocking()=%d", len(blocking))
1347         if len(blocking) == 0:
1348             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1349             if row["software"] == "pleroma":
1350                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1351                 blocking = pleroma.fetch_blocks(row["domain"])
1352             elif row["software"] == "mastodon":
1353                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1354                 blocking = mastodon.fetch_blocks(row["domain"])
1355             elif row["software"] == "lemmy":
1356                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1357                 blocking = lemmy.fetch_blocks(row["domain"])
1358             elif row["software"] == "friendica":
1359                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1360                 blocking = friendica.fetch_blocks(row["domain"])
1361             elif row["software"] == "misskey":
1362                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1363                 blocking = misskey.fetch_blocks(row["domain"])
1364             else:
1365                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1366
1367         # chaos.social ("c.s") isn't part of oliphant's "hidden" blocklists
1368         logger.debug("row[domain]='%s'", row["domain"])
1369         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1370             logger.debug("Invoking instances.set_last_blocked(%s) and instances.set_total_blocks(%s, %d) ...", row["domain"], row["domain"], len(blocking))
1371             instances.set_last_blocked(row["domain"])
1372             instances.set_total_blocks(row["domain"], blocking)
1373
1374         obfuscated = 0
1375         blockdict = list()
1376
1377         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1378         for block in blocking:
1379             logger.debug("block[blocked]='%s'", block["blocked"])
1380             blocked = None
1381
1382             if block["blocked"] == "":
1383                 logger.debug("block[blocked] is empty - SKIPPED!")
1384                 continue
1385             elif block["blocked"].endswith(".onion"):
1386                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1387                 continue
1388             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1389                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1390                 continue
1391             elif block["blocked"].endswith(".arpa"):
1392                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1393                 continue
1394             elif block["blocked"].endswith(".tld"):
1395                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1396                 continue
1397             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1398                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1399                 obfuscated = obfuscated + 1
1400                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1401             elif not domain_helper.is_wanted(block["blocked"]):
1402                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1403                 continue
1404             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1405                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1406                 continue
1407
1408             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1409             if blocked is not None and blocked != block["blocked"]:
1410                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1411                 obfuscated = obfuscated - 1
1412
1413                 if blacklist.is_blacklisted(blocked):
1414                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1415                     continue
1416                 elif blacklist.is_blacklisted(row["domain"]):
1417                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1418                     continue
1419                 elif blocks.is_instance_blocked(row["domain"], blocked):
1420                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1421                     continue
1422
1423                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1424
1425                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1426                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1427                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1428                     blockdict.append({
1429                         "blocked": blocked,
1430                         "reason" : block["reason"],
1431                     })
1432
1433         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1434         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1435         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1436
1437         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1438         if instances.has_pending(row["domain"]):
1439             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1440             instances.update(row["domain"])
1441
1442         logger.debug("Invoking commit() ...")
1443         database.connection.commit()
1444
1445         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1446         if config.get("bot_enabled") and len(blockdict) > 0:
1447             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1448             network.send_bot_post(row["domain"], blockdict)
1449
1450     logger.debug("Success! - EXIT!")
1451     return 0
1452
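# Fetches the instance list as CSV from demo.fedilist.com, optionally filtered
# by args.software, and crawls every wanted domain that is neither registered
# (unless args.force is set) nor recently crawled.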
1453 def fetch_fedilist(args: argparse.Namespace) -> int:
1454     logger.debug("args[]='%s' - CALLED!", type(args))
1455
1456     logger.debug("Invoking locking.acquire() ...")
1457     locking.acquire()
1458
1459     source_domain = "demo.fedilist.com"
1460     if sources.is_recent(source_domain):
1461         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1462         return 1
1463     else:
1464         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1465         sources.update(source_domain)
1466
1467     url = f"http://{source_domain}/instance/csv?onion=not"
1468     if args.software is not None and args.software != "":
1469         logger.debug("args.software='%s'", args.software)
1470         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1471
1472     logger.info("Fetching url='%s' ...", url)
1473     response = reqto.get(
1474         url,
1475         headers=network.web_headers,
1476         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1477         allow_redirects=False
1478     )
1479
1480     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1481     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1482         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1483         return 1
1484
1485     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1486
1487     logger.debug("reader[]='%s'", type(reader))
1488     if reader is None:
1489         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1490         return 2
1491
1492     rows = list(reader)
1493
1494     logger.info("Checking %d rows ...", len(rows))
1495     for row in rows:
1496         logger.debug("row[]='%s'", type(row))
1497         if "hostname" not in row:
1498             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1499             continue
1500
1501         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1502         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1503         logger.debug("domain='%s' - AFTER!", domain)
1504
1505         if domain in [None, ""]:
1506             logger.debug("domain[%s]='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", type(domain), domain, row["hostname"])
1507             continue
1508
1509         logger.debug("domain='%s' - BEFORE!", domain)
1510         domain = domain.encode("idna").decode("utf-8")
1511         logger.debug("domain='%s' - AFTER!", domain)
1512
1513         if not domain_helper.is_wanted(domain):
1514             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1515             continue
1516         elif (args.force is None or not args.force) and instances.is_registered(domain):
1517             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1518             continue
1519         elif instances.is_recent(domain):
1520             logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1521             continue
1522
1523         logger.info("Fetching instances from domain='%s' ...", domain)
1524         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1525
1526     logger.debug("Success! - EXIT!")
1527     return 0
1528
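# Re-runs software detection for instances selected via args.domain,
# args.software, args.mode, args.no_software, args.with_software, args.no_auto,
# args.no_detection or args.same; without any filter, all instances are
# checked, least recently updated first. Detected changes are written back via
# instances.set_software().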
1529 def update_nodeinfo(args: argparse.Namespace) -> int:
1530     logger.debug("args[]='%s' - CALLED!", type(args))
1531
1532     logger.debug("Invoking locking.acquire() ...")
1533     locking.acquire()
1534
1535     if args.domain is not None and args.domain != "":
1536         logger.debug("Fetching args.domain='%s'", args.domain)
1537         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1538     elif args.software is not None and args.software != "":
1539         logger.info("Fetching domains for args.software='%s'", args.software)
1540         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1541     elif args.mode is not None and args.mode != "":
1542         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1543         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1544     elif args.no_software:
1545         logger.info("Fetching domains with no software type detected ...")
1546         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1547     elif args.with_software:
1548         logger.info("Fetching domains with any software type detected ...")
1549         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1550     elif args.no_auto:
1551         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1552         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1553     elif args.no_detection:
1554         logger.info("Fetching domains with no detection mode being set ...")
1555         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1556     elif args.same:
1557         logger.info("Fetching domains with domain name and software being the same ...")
1558         database.cursor.execute("SELECT domain, software FROM instances WHERE domain=software ORDER BY last_updated ASC")
1559     else:
1560         logger.info("Fetching all domains, least recently updated first ...")
1561         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1562
1563     domains = database.cursor.fetchall()
1564
1565     logger.info("Checking %d domain(s) ...", len(domains))
1566     cnt = 0
1567     for row in domains:
1568         logger.debug("row[]='%s'", type(row))
1569         if row["domain"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1570             logger.debug("row[domain]='%s' is an I2P address - SKIPPED!", row["domain"])
1571             continue
1572         elif row["domain"].endswith(".onion"):
1573             logger.debug("row[domain]='%s' is a TOR .onion domain - SKIPPED!", row["domain"])
1574             continue
1575         elif row["domain"].endswith(".arpa"):
1576             logger.debug("row[domain]='%s' is a reverse IP address - SKIPPED!", row["domain"])
1577             continue
1578         elif row["domain"].endswith(".tld"):
1579             logger.debug("row[domain]='%s' is a fake domain - SKIPPED!", row["domain"])
1580             continue
1581         elif blacklist.is_blacklisted(row["domain"]):
1582             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1583             continue
1584         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1585             logger.debug("row[domain]='%s' has recently been checked - SKIPPED!", row["domain"])
1586             continue
1587
1588         try:
1589             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1590             software = federation.determine_software(row["domain"])
1591
1592             logger.debug("Determined software='%s'", software)
1593             if (software != row["software"] and software is not None) or args.force is True:
1594                 logger.debug("software='%s'", software)
1595                 if software is None:
1596                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1597                     instances.set_nodeinfo_url(row["domain"], None)
1598
1599                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1600                 instances.set_software(row["domain"], software)
1601
1602             if software is not None:
1603                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1604                 instances.set_success(row["domain"])
1605         except network.exceptions as exception:
1606             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1607             instances.set_last_error(row["domain"], exception)
1608
1609         instances.set_last_nodeinfo(row["domain"])
1610         instances.update(row["domain"])
1611         cnt = cnt + 1
1612
1613     logger.debug("Success! - EXIT!")
1614     return 0
1615
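# Fetches the instance list from the instances.social API (requires
# 'instances_social_api_key' to be set in config.json) and crawls each newly
# discovered instance.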
1616 def fetch_instances_social(args: argparse.Namespace) -> int:
1617     logger.debug("args[]='%s' - CALLED!", type(args))
1618
1619     logger.debug("Invoking locking.acquire() ...")
1620     locking.acquire()
1621
1622     source_domain = "instances.social"
1623
1624     if config.get("instances_social_api_key") == "":
1625         logger.error("API key not set. Please set in your config.json file.")
1626         return 1
1627     elif sources.is_recent(source_domain):
1628         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1629         return 2
1630     else:
1631         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1632         sources.update(source_domain)
1633
1634     headers = {
1635         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1636     }
1637
1638     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1639     fetched = network.get_json_api(
1640         source_domain,
1641         "/api/1.0/instances/list?count=0&sort_by=name",
1642         headers=headers,
1643         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1644     )
1645     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1646
1647     if "error_message" in fetched:
1648         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1649         return 2
1650     elif "exception" in fetched:
1651         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1652         return 3
1653     elif "json" not in fetched:
1654         logger.warning("fetched has no element 'json' - EXIT!")
1655         return 4
1656     elif "instances" not in fetched["json"]:
1657         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1658         return 5
1659
1660     domains = list()
1661     rows = fetched["json"]["instances"]
1662
1663     logger.info("Checking %d row(s) ...", len(rows))
1664     for row in rows:
1665         logger.debug("row[]='%s'", type(row))
1666         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1667         logger.debug("domain='%s' - AFTER!", domain)
1668
1669         if domain in [None, ""]:
1670             logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
1671             continue
1672
1673         logger.debug("domain='%s' - BEFORE!", domain)
1674         domain = domain.encode("idna").decode("utf-8")
1675         logger.debug("domain='%s' - AFTER!", domain)
1676
1677         if not domain_helper.is_wanted(domain):
1678             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1679             continue
1680         elif domain in domains:
1681             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1682             continue
1683         elif instances.is_registered(domain):
1684             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1685             continue
1686         elif instances.is_recent(domain):
1687             logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1688             continue
1689
1690         logger.info("Fetching instances from domain='%s' ...", domain)
1691         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1692
1693     logger.debug("Success! - EXIT!")
1694     return 0
1695
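# Fetches the relay list from api.relaylist.com (/relays) and crawls each
# relay domain parsed from the returned URLs.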
1696 def fetch_relaylist(args: argparse.Namespace) -> int:
1697     logger.debug("args[]='%s' - CALLED!", type(args))
1698
1699     logger.debug("Invoking locking.acquire() ...")
1700     locking.acquire()
1701
1702     source_domain = "api.relaylist.com"
1703
1704     if sources.is_recent(source_domain):
1705         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1706         return 1
1707     else:
1708         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1709         sources.update(source_domain)
1710
1711     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1712     fetched = network.get_json_api(
1713         source_domain,
1714         "/relays",
1715         {},
1716         (config.get("connection_timeout"), config.get("read_timeout"))
1717     )
1718     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1719
1720     if "error_message" in fetched:
1721         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1722         return 2
1723     elif "exception" in fetched:
1724         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1725         return 3
1726     elif "json" not in fetched:
1727         logger.warning("fetched has no element 'json' - EXIT!")
1728         return 4
1729
1730     domains = list()
1731
1732     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1733     for row in fetched["json"]:
1734         logger.debug("row[]='%s'", type(row))
1735         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1736         logger.debug("domain='%s' - AFTER!", domain)
1737
1738         if domain in [None, ""]:
1739             logger.debug("domain[%s]='%s' is empty after parsing row[url]='%s' - SKIPPED!", type(domain), domain, row["url"])
1740             continue
1741
1742         logger.debug("domain='%s' - BEFORE!", domain)
1743         domain = domain.encode("idna").decode("utf-8")
1744         logger.debug("domain='%s' - AFTER!", domain)
1745
1746         if not domain_helper.is_wanted(domain):
1747             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1748             continue
1749         elif domain in domains:
1750             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1751             continue
1752         elif instances.is_registered(domain):
1753             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1754             continue
1755         elif instances.is_recent(domain):
1756             logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1757             continue
1758
1759         logger.info("Fetching instances from domain='%s'", domain)
1760         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1761
1762     logger.debug("Success! - EXIT!")
1763     return 0
1764
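# Determines the peers of known relay instances (activityrelay, aoderelay,
# selective-relay, pub-relay). pub-relay peers are read from nodeinfo metadata;
# the other relay types are scraped from their HTML front page. The peer list
# is stored per relay and newly discovered domains are crawled afterwards.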
1765 def fetch_relays(args: argparse.Namespace) -> int:
1766     logger.debug("args[]='%s' - CALLED!", type(args))
1767
1768     logger.debug("Invoking locking.acquire() ...")
1769     locking.acquire()
1770
1771     if args.domain is not None and args.domain != "":
1772         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1773         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1774     elif args.software is not None and args.software != "":
1775         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1776         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1777     else:
1778         logger.debug("Fetch all relay instances ...")
1779         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1780
1781     domains = list()
1782     rows = database.cursor.fetchall()
1783
1784     logger.info("Checking %d relays ...", len(rows))
1785     for row in rows:
1786         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1787         if not args.force and instances.is_recent(row["domain"]):
1788             logger.debug("row[domain]='%s' has recently been fetched - SKIPPED!", row["domain"])
1789             continue
1790         elif row["nodeinfo_url"] is None:
1791             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1792             continue
1793
1794         peers = list()
1795         try:
1796             logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1797             if row["software"] == "pub-relay":
1798                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1799                 raw = network.fetch_api_url(
1800                     row["nodeinfo_url"],
1801                     (config.get("connection_timeout"), config.get("read_timeout"))
1802                 )
1803
1804                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1805                 if "exception" in raw:
1806                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1807                     raise raw["exception"]
1808                 elif "error_message" in raw:
1809                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1810                     instances.set_last_error(row["domain"], raw)
1811                     instances.set_last_instance_fetch(row["domain"])
1812                     instances.update(row["domain"])
1813                     continue
1814                 elif "json" not in raw:
1815                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1816                     continue
1817                 elif "metadata" not in raw["json"]:
1818                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1819                     continue
1820                 elif "peers" not in raw["json"]["metadata"]:
1821                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1822                     continue
1823             else:
1824                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1825                 raw = network.fetch_url(
1826                     f"https://{row['domain']}",
1827                     network.web_headers,
1828                     (config.get("connection_timeout"), config.get("read_timeout"))
1829                 ).text
1830                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1831
1832                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1833                 logger.debug("doc[]='%s'", type(doc))
1834
1835         except network.exceptions as exception:
1836             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1837             instances.set_last_error(row["domain"], exception)
1838             instances.set_last_instance_fetch(row["domain"])
1839             instances.update(row["domain"])
1840             continue
1841
1842         logger.debug("row[software]='%s'", row["software"])
1843         if row["software"] == "activityrelay":
1844             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1845             tags = doc.findAll("p")
1846
1847             logger.debug("Checking %d paragraphs ...", len(tags))
1848             for tag in tags:
1849                 logger.debug("tag[]='%s'", type(tag))
1850                 if len(tag.contents) == 0:
1851                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1852                     continue
1853                 elif "registered instances" not in tag.contents[0]:
1854                     logger.debug("Skipping paragraph, text not found.")
1855                     continue
1856
1857                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1858                 for domain in tag.contents:
1859                     logger.debug("domain[%s]='%s'", type(domain), domain)
1860                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1861                         continue
1862
1863                     domain = str(domain)
1864                     logger.debug("domain='%s'", domain)
1865                     if not domain_helper.is_wanted(domain):
1866                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1867                         continue
1868
1869                     logger.debug("domain='%s' - BEFORE!", domain)
1870                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1871                     logger.debug("domain='%s' - AFTER!", domain)
1872
1873                     if domain in [None, ""]:
1874                         logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1875                         continue
1876                     elif domain not in peers:
1877                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1878                         peers.append(domain)
1879
1880                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1881                     if dict_helper.has_key(domains, "domain", domain):
1882                         logger.debug("domain='%s' already added", domain)
1883                         continue
1884
1885                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1886                     domains.append({
1887                         "domain": domain,
1888                         "origin": row["domain"],
1889                     })
1890         elif row["software"] in ["aoderelay", "selective-relay"]:
1891             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1892             if row["software"] == "aoderelay":
1893                 tags = doc.findAll("section", {"class": "instance"})
1894             else:
1895                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1896
1897             logger.debug("Checking %d tags ...", len(tags))
1898             for tag in tags:
1899                 logger.debug("tag[]='%s'", type(tag))
1900
1901                 link = tag.find("a")
1902                 logger.debug("link[%s]='%s'", type(link), link)
1903                 if not isinstance(link, bs4.element.Tag):
1904                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1905                     continue
1906
1907                 components = urlparse(link.get("href"))
1908                 logger.debug("components(%d)='%s'", len(components), components)
1909                 domain = components.netloc.lower().split(":")[0]
1910
1911                 logger.debug("domain='%s' - BEFORE!", domain)
1912                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1913                 logger.debug("domain='%s' - AFTER!", domain)
1914
1915                 if domain in [None, ""]:
1916                     logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1917                     continue
1918                 elif domain not in peers:
1919                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1920                     peers.append(domain)
1921
1922                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1923                 if dict_helper.has_key(domains, "domain", domain):
1924                     logger.debug("domain='%s' already added", domain)
1925                     continue
1926
1927                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1928                 domains.append({
1929                     "domain": domain,
1930                     "origin": row["domain"],
1931                 })
1932         elif row["software"] == "pub-relay":
1933             logger.debug("Checking %d peer(s) of row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1934             for domain in raw["json"]["metadata"]["peers"]:
1935                 logger.debug("domain='%s' - BEFORE!", domain)
1936                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1937                 logger.debug("domain='%s' - AFTER!", domain)
1938
1939                 if domain in [None, ""]:
1940                     logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1941                     continue
1942                 elif domain not in peers:
1943                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1944                     peers.append(domain)
1945
1946                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1947                 if dict_helper.has_key(domains, "domain", domain):
1948                     logger.debug("domain='%s' already added", domain)
1949                     continue
1950
1951                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1952                 domains.append({
1953                     "domain": domain,
1954                     "origin": row["domain"],
1955                 })
1956         else:
1957             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1958             continue
1959
1960         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1961         instances.set_last_instance_fetch(row["domain"])
1962
1963         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1964         instances.set_total_peers(row["domain"], peers)
1965
1966         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1967         instances.update(row["domain"])
1968
1969     logger.info("Checking %d domains ...", len(domains))
1970     for row in domains:
1971         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1972         if not domain_helper.is_wanted(row["domain"]):
1973             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1974             continue
1975         elif instances.is_registered(row["domain"]):
1976             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1977             continue
1978         elif instances.is_recent(row["domain"]):
1979             logger.debug("row[domain]='%s' has recently been crawled - SKIPPED!", row["domain"])
1980             continue
1981
1982         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1983         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1984
1985     logger.debug("Success! - EXIT!")
1986     return 0
1987
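# Maintenance command: translates all non-punycode domain, origin, blocker and
# blocked entries into their IDNA ('xn--') representation via translate_idnas().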
1988 def convert_idna(args: argparse.Namespace) -> int:
1989     logger.debug("args[]='%s' - CALLED!", type(args))
1990
1991     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1992     rows = database.cursor.fetchall()
1993
1994     logger.debug("rows[]='%s'", type(rows))
1995     instances.translate_idnas(rows, "domain")
1996
1997     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1998     rows = database.cursor.fetchall()
1999
2000     logger.debug("rows[]='%s'", type(rows))
2001     instances.translate_idnas(rows, "origin")
2002
2003     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2004     rows = database.cursor.fetchall()
2005
2006     logger.debug("rows[]='%s'", type(rows))
2007     blocks.translate_idnas(rows, "blocker")
2008
2009     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2010     rows = database.cursor.fetchall()
2011
2012     logger.debug("rows[]='%s'", type(rows))
2013     blocks.translate_idnas(rows, "blocked")
2014
2015     logger.debug("Success! - EXIT!")
2016     return 0
2017
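# Maintenance command: deletes instances whose domain name does not validate,
# including their block records, then vacuums the database.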
2018 def remove_invalid(args: argparse.Namespace) -> int:
2019     logger.debug("args[]='%s' - CALLED!", type(args))
2020
2021     logger.debug("Invoking locking.acquire() ...")
2022     locking.acquire()
2023
2024     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2025     rows = database.cursor.fetchall()
2026
2027     logger.info("Checking %d domains ...", len(rows))
2028     for row in rows:
2029         logger.debug("row[domain]='%s'", row["domain"])
2030         if not validators.domain(row["domain"].split("/")[0]):
2031             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2032             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2033             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2034
2035     logger.debug("Invoking commit() ...")
2036     database.connection.commit()
2037
2038     logger.info("Vacuum cleaning database ...")
2039     database.cursor.execute("VACUUM")
2040
2041     logger.debug("Success! - EXIT!")
2042     return 0