# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

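# Validates a single domain given via --domain and maps its state to an exit
# code: 100 = malformed, 101 = blacklisted, 102 = already registered, 0 = new.
# Presumably wired up as a sub-command under its function name, e.g.:
#   ./fba.py check_instance --domain=example.com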
def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

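# Consistency check: flags instances whose stored nodeinfo_url points at a
# different host than the instance's own domain (or its punycode form).
# Relative nodeinfo URLs always match and are skipped.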
def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

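# Imports the server directory from the pixelfed.org API and feeds every new,
# wanted domain into federation.fetch_instances(). The source is rate-limited
# via sources.is_recent()/sources.update(), like all fetchers below.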
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers yourself here
    headers = dict()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

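# Pulls a domain list from the gql.api.bka.li GraphQL endpoint. The query
# ("query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}") returns
# plain domain names, which are IDNA-encoded before being crawled.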
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows(data)()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

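# Core command: fetches block lists either for a single --domain, for all
# instances of one --software, or for every supported instance. Entries may be
# obfuscated ("*" or "?" placeholders); instances.deobfuscate() tries to
# recover the real domain, optionally helped by a hash digest. Presumably
# invoked like: ./fba.py fetch_blocks --software=mastodon --force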
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=None
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", blocker, block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse DNS domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Domain wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

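# Crawls fediverse.observer: scrapes the software drop-down menu from the HTML
# front page (unless --software is given), then queries the GraphQL API at
# api.fediverse.observer once per software type for its domain table.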
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = network.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif "data" not in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif "nodes" not in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))

        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if "domain" not in item:
                logger.debug("item()=%d has no element 'domain' - SKIPPED!", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

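# Parses the HTML block list published at wiki.todon.eu/todon/domainblocks,
# mapping the "silenced/limited" section to block level 'silenced' and the
# "suspended" section to 'reject' for blocker todon.eu.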
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

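# Parses chaos.social's federation.md (rendered from Markdown to HTML first,
# hence the long extension list) and stores the "Silenced instances" and
# "Blocked instances" tables as block levels 'silenced' and 'reject'.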
def fetch_cs(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

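# Reads an FBA-specific RSS feed given via --feed; the instance domain is
# expected as the value after '=' in each item's link URL.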
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

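# Reads the FBA bot's ATOM feed (default: ryona.agency, overridable via
# --feed) and extracts candidate domains from the href attributes inside
# each entry's HTML content.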
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

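# Two-phase crawl: first (re-)fetches the instance(s) selected via --domain or
# --software, then walks all known instances of supported software whose
# last_instance_fetch is older than the configured recheck_instance interval.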
def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Init variables
    rows = list()

    # Is domain or software set?
    if args.domain not in [None, ""]:
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101

        logger.debug("args.domain='%s' - BEFORE!", args.domain)
        domain = tidyup.domain(args.domain)
        logger.debug("domain='%s' - AFTER!", domain)

        # Fetch record
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])
        rows = database.cursor.fetchall()
    elif args.software not in [None, ""]:
        logger.debug("args.software='%s' - BEFORE!", args.software)
        software = software_helper.alias(args.software)
        logger.debug("software='%s' - AFTER!", software)

        # Fetch records
        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [software])
        rows = database.cursor.fetchall()

    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
        if instances.is_registered(row["domain"]) and row["software"] is None:
            logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
            continue
        elif instances.is_registered(row["domain"]) and software_helper.is_relay(row["software"]):
            logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
            continue
        elif not args.force and args.software not in [None, ""] and instances.is_recent(row["domain"]):
            logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
            continue

        # Initial fetch
        try:
            logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)
            instances.update(row["domain"])
            continue

        if args.single:
            logger.debug("Not fetching more instances - BREAK!")
            break

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software \
FROM instances \
WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen') \
AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) \
ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

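# Processes all CSV block lists registered in blocklists.csv_files, optionally
# restricted to one blocker via --domain; the heavy lifting is done by
# processing.csv_block().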
def fetch_csv(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

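# Downloads the oliphant blocklists from codeberg.org; each entry in
# blocklists.oliphant_blocklists maps a blocker to its CSV file below the
# repository's blocklists/ directory.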
def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is domain given and not equal blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

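# Fetches plain-text block lists (one domain per line) from the URLs in
# blocklists.txt_files and registers each wanted domain against the
# configured blocker.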
1101 def fetch_txt(args: argparse.Namespace) -> int:
1102     logger.debug("args[]='%s' - CALLED!", type(args))
1103
1104     logger.debug("Invoking locking.acquire() ...")
1105     locking.acquire()
1106
1107     logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
1108     for row in blocklists.txt_files:
1109         logger.debug("Fetching row[url]='%s' ...", row["url"])
1110         response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1111
1112         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1113         if response.ok and response.status_code == 200 and response.text != "":
1114             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1115             domains = response.text.strip().split("\n")
1116
1117             logger.info("Processing %d domains ...", len(domains))
1118             for domain in domains:
1119                 logger.debug("domain='%s' - BEFORE!", domain)
1120                 domain = tidyup.domain(domain) if domain not in[None, ""] else None
1121                 logger.debug("domain='%s' - AFTER!", domain)
1122
1123                 if domain in [None, ""]:
1124                     logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1125                     continue
1126                 elif not domain_helper.is_wanted(domain):
1127                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1128                     continue
1129                 elif not args.force and instances.is_registered(domain):
1130                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1131                     continue
1132
1133                 logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
1134                 processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
1135                 logger.debug("processed='%s'", processed)
1136
1137     logger.debug("Success! - EXIT!")
1138     return 0
1139
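# A simplified stand-in for the text-file handling in fetch_txt() above: one
# domain per line, blanks and invalid names skipped. The real command
# normalizes through tidyup.domain() and domain_helper.is_wanted(), which do
# more than this sketch:
def _sketch_parse_txt_blocklist(text: str) -> list:
    import validators
    domains = []
    for line in text.strip().split("\n"):
        candidate = line.strip().lower()
        # validators.domain() returns a falsy object for invalid names.
        if candidate == "" or not validators.domain(candidate):
            continue
        domains.append(candidate)
    return domains
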
1140 def fetch_fedipact(args: argparse.Namespace) -> int:
1141     logger.debug("args[]='%s' - CALLED!", type(args))
1142
1143     logger.debug("Invoking locking.acquire() ...")
1144     locking.acquire()
1145
1146     source_domain = "fedipact.online"
1147     if sources.is_recent(source_domain):
1148         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1149         return 1
1150     else:
1151         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1152         sources.update(source_domain)
1153
1154     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1155     response = network.fetch_url(
1156         f"https://{source_domain}",
1157         network.web_headers,
1158         (config.get("connection_timeout"), config.get("read_timeout"))
1159     )
1160
1161     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1162     if response.ok and response.status_code == 200 and response.text != "":
1163         logger.debug("Parsing %d Bytes ...", len(response.text))
1164
1165         doc = bs4.BeautifulSoup(response.text, "html.parser")
1166         logger.debug("doc[]='%s'", type(doc))
1167
1168         rows = doc.findAll("li")
1169         logger.info("Checking %d row(s) ...", len(rows))
1170         for row in rows:
1171             logger.debug("row[]='%s'", type(row))
1172             domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None
1173
1174             logger.debug("domain='%s' - AFTER!", domain)
1175             if domain in [None, ""]:
1176                 logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1177                 continue
1178
1179             logger.debug("domain='%s' - BEFORE!", domain)
1180             domain = domain.encode("idna").decode("utf-8")
1181             logger.debug("domain='%s' - AFTER!", domain)
1182
1183             if not domain_helper.is_wanted(domain):
1184                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1185                 continue
1186             elif instances.is_registered(domain):
1187                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1188                 continue
1189             elif instances.is_recent(domain):
1190                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1191                 continue
1192
1193             logger.info("Fetching domain='%s' ...", domain)
1194             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1195
1196     logger.debug("Success! - EXIT!")
1197     return 0
1198
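# fetch_fedipact() above scrapes <li> elements from the pact's landing page
# and takes each row's first child node as the domain. A self-contained
# sketch of that step (the HTML sample in the comment is invented):
def _sketch_scrape_li_domains(html: str) -> list:
    import bs4
    doc = bs4.BeautifulSoup(html, "html.parser")
    # For a plain <li>example.social</li>, contents[0] is the text node
    # carrying the domain name.
    return [str(row.contents[0]).strip() for row in doc.find_all("li") if row.contents]

# Example: _sketch_scrape_li_domains("<ul><li>example.social</li></ul>")
# returns ["example.social"].
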
1199 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1200     logger.debug("args[]='%s' - CALLED!", type(args))
1201
1202     logger.debug("Invoking locking.acquire() ...")
1203     locking.acquire()
1204
1205     source_domain = "instances.joinmobilizon.org"
1206     if sources.is_recent(source_domain):
1207         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1208         return 1
1209     else:
1210         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1211         sources.update(source_domain)
1212
1213     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1214     raw = network.fetch_url(
1215         f"https://{source_domain}/api/v1/instances",
1216         network.web_headers,
1217         (config.get("connection_timeout"), config.get("read_timeout"))
1218     ).text
1219     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1220
1221     parsed = json.loads(raw)
1222     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1223
1224     if "data" not in parsed:
1225         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1226         return 1
1227
1228     logger.info("Checking %d instances ...", len(parsed["data"]))
1229     for row in parsed["data"]:
1230         logger.debug("row[]='%s'", type(row))
1231         if "host" not in row:
1232             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1233             continue
1234         elif not domain_helper.is_wanted(row["host"]):
1235             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1236             continue
1237         elif instances.is_registered(row["host"]):
1238             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1239             continue
1240
1241         logger.info("Fetching row[host]='%s' ...", row["host"])
1242         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1243
1244     logger.debug("Success! - EXIT!")
1245     return 0
1246
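# fetch_joinmobilizon() above expects a {"data": [{"host": ...}, ...]}
# envelope; the shape is inferred from the command's own checks. A sketch of
# that defensive extraction:
def _sketch_extract_hosts(raw: str) -> list:
    import json
    parsed = json.loads(raw)
    if "data" not in parsed:
        # Mirrors the early return when the envelope key is missing.
        return []
    return [row["host"] for row in parsed["data"] if "host" in row]
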
1247 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1248     logger.debug("args[]='%s' - CALLED!", type(args))
1249
1250     logger.debug("Invoking locking.acquire() ...")
1251     locking.acquire()
1252
1253     source_domain = "instanceapp.misskey.page"
1254     if sources.is_recent(source_domain):
1255         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1256         return 1
1257     else:
1258         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1259         sources.update(source_domain)
1260
1261     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1262     raw = network.fetch_url(
1263         f"https://{source_domain}/instances.json",
1264         network.web_headers,
1265         (config.get("connection_timeout"), config.get("read_timeout"))
1266     ).text
1267     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1268
1269     parsed = json.loads(raw)
1270     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1271
1272     if "instancesInfos" not in parsed:
1273         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1274         return 1
1275
1276     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1277     for row in parsed["instancesInfos"]:
1278         logger.debug("row[%s]='%s'", type(row), row)
1279         if "url" not in row:
1280             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1281             continue
1282         elif not domain_helper.is_wanted(row["url"]):
1283             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1284             continue
1285         elif instances.is_registered(row["url"]):
1286             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1287             continue
1288
1289         logger.info("Fetching row[url]='%s' ...", row["url"])
1290         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1291
1292     logger.debug("Success! - EXIT!")
1293     return 0
1294
1295 def recheck_obfuscation(args: argparse.Namespace) -> int:
1296     logger.debug("args[]='%s' - CALLED!", type(args))
1297
1298     logger.debug("Invoking locking.acquire() ...")
1299     locking.acquire()
1300
1301     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1302         logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1303         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1304     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1305         logger.debug("Fetching records for args.software='%s' ...", args.software)
1306         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1307     else:
1308         logger.debug("Fetching records where domains have obfuscated block entries ...")
1309         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1310
1311     rows = database.cursor.fetchall()
1312     logger.info("Checking %d domains ...", len(rows))
1313     for row in rows:
1314         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1315         if blacklist.is_blacklisted(row["domain"]):
1316             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1317             continue
1318         elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1319             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1320             continue
1321
1322         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1323         blocking = federation.fetch_blocks(row["domain"])
1324
1325         logger.debug("blocking()=%d", len(blocking))
1326         if len(blocking) == 0:
1327             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1328             if row["software"] == "pleroma":
1329                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1330                 blocking = pleroma.fetch_blocks(row["domain"])
1331             elif row["software"] == "mastodon":
1332                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1333                 blocking = mastodon.fetch_blocks(row["domain"])
1334             elif row["software"] == "lemmy":
1335                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1336                 blocking = lemmy.fetch_blocks(row["domain"])
1337             elif row["software"] == "friendica":
1338                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1339                 blocking = friendica.fetch_blocks(row["domain"])
1340             elif row["software"] == "misskey":
1341                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1342                 blocking = misskey.fetch_blocks(row["domain"])
1343             else:
1344                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1345
1346         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1347         logger.debug("row[domain]='%s'", row["domain"])
1348         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1349             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1350             instances.set_last_blocked(row["domain"])
1351             instances.set_total_blocks(row["domain"], blocking)
1352
1353         obfuscated = 0
1354         blockdict = list()
1355
1356         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1357         for block in blocking:
1358             logger.debug("block[blocked]='%s'", block["blocked"])
1359             blocked = None
1360
1361             if block["blocked"] == "":
1362                 logger.debug("block[blocked] is empty - SKIPPED!")
1363                 continue
1364             elif block["blocked"].endswith(".onion"):
1365                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1366                 continue
1367             elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1368                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1369                 continue
1370             elif block["blocked"].endswith(".arpa"):
1371                 logger.debug("blocked='%s' is a reverse DNS (.arpa) domain - SKIPPED!", block["blocked"])
1372                 continue
1373             elif block["blocked"].endswith(".tld"):
1374                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1375                 continue
1376             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1377                 logger.debug("blocked='%s' is obfuscated.", block["blocked"])
1378                 obfuscated = obfuscated + 1
1379                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1380             elif not domain_helper.is_wanted(block["blocked"]):
1381                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1382                 continue
1383             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1384                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1385                 continue
1386
1387             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1388             if blocked is not None and blocked != block["blocked"]:
1389                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1390                 obfuscated = obfuscated - 1
1391
1392                 if blacklist.is_blacklisted(blocked):
1393                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1394                     continue
1395                 elif blacklist.is_blacklisted(row["domain"]):
1396                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1397                     continue
1398                 elif blocks.is_instance_blocked(row["domain"], blocked):
1399                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1400                     continue
1401
1402                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1403
1404                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1405                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1406                     logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1407                     blockdict.append({
1408                         "blocked": blocked,
1409                         "reason" : block["reason"],
1410                     })
1411
1412         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1413         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1414         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1415
1416         logger.info("domain='%s' has %d obfuscated block entry(s)", row["domain"], obfuscated)
1417         if instances.has_pending(row["domain"]):
1418             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1419             instances.update(row["domain"])
1420
1421         logger.debug("Invoking commit() ...")
1422         database.connection.commit()
1423
1424         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1425         if config.get("bot_enabled") and len(blockdict) > 0:
1426             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1427             network.send_bot_post(row["domain"], blockdict)
1428
1429     logger.debug("Success! - EXIT!")
1430     return 0
1431
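# recheck_obfuscation() above counts entries like "exam*le.com" as obfuscated
# and asks utils.deobfuscate() to recover the real name. A simplified,
# hypothetical matcher; the real helper can also use the digest field some
# blockers publish alongside the entry:
def _sketch_deobfuscate(pattern: str, candidates: list) -> str:
    import fnmatch
    # "*" and "?" behave as shell-style wildcards; resolution only succeeds
    # when exactly one known candidate matches.
    matches = [domain for domain in candidates if fnmatch.fnmatch(domain, pattern)]
    return matches[0] if len(matches) == 1 else None
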
1432 def fetch_fedilist(args: argparse.Namespace) -> int:
1433     logger.debug("args[]='%s' - CALLED!", type(args))
1434
1435     logger.debug("Invoking locking.acquire() ...")
1436     locking.acquire()
1437
1438     source_domain = "demo.fedilist.com"
1439     if sources.is_recent(source_domain):
1440         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1441         return 1
1442     else:
1443         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1444         sources.update(source_domain)
1445
1446     url = f"http://{source_domain}/instance/csv?onion=not"
1447     if args.software is not None and args.software != "":
1448         logger.debug("args.software='%s'", args.software)
1449         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1450
1451     logger.info("Fetching url='%s' ...", url)
1452     response = reqto.get(
1453         url,
1454         headers=network.web_headers,
1455         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1456         allow_redirects=False
1457     )
1458
1459     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1460     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1461         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1462         return 1
1463
1464     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1465
1466     logger.debug("reader[]='%s'", type(reader))
1467     if reader is None:
1468         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1469         return 2
1470
1471     rows = list(reader)
1472
1473     logger.info("Checking %d rows ...", len(rows))
1474     for row in rows:
1475         logger.debug("row[]='%s'", type(row))
1476         if "hostname" not in row:
1477             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1478             continue
1479
1480         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1481         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1482         logger.debug("domain='%s' - AFTER!", domain)
1483
1484         if domain in [None, ""]:
1485             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1486             continue
1487
1488         logger.debug("domain='%s' - BEFORE!", domain)
1489         domain = domain.encode("idna").decode("utf-8")
1490         logger.debug("domain='%s' - AFTER!", domain)
1491
1492         if not domain_helper.is_wanted(domain):
1493             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1494             continue
1495         elif (args.force is None or not args.force) and instances.is_registered(domain):
1496             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1497             continue
1498         elif instances.is_recent(domain):
1499             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1500             continue
1501
1502         logger.info("Fetching instances from domain='%s' ...", domain)
1503         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1504
1505     logger.debug("Success! - EXIT!")
1506     return 0
1507
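# fetch_fedilist() above feeds the downloaded CSV to csv.DictReader; the
# "hostname" column name comes from the command's own checks. In isolation:
def _sketch_read_hostnames(payload: bytes) -> list:
    import csv
    reader = csv.DictReader(payload.decode("utf-8").splitlines(), dialect="unix")
    return [row["hostname"] for row in reader if "hostname" in row]

# Example: _sketch_read_hostnames(b"hostname\nexample.com\n") returns
# ["example.com"].
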
1508 def update_nodeinfo(args: argparse.Namespace) -> int:
1509     logger.debug("args[]='%s' - CALLED!", type(args))
1510
1511     logger.debug("Invoking locking.acquire() ...")
1512     locking.acquire()
1513
1514     if args.domain is not None and args.domain != "":
1515         logger.debug("Fetching args.domain='%s'", args.domain)
1516         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1517     elif args.software is not None and args.software != "":
1518         logger.info("Fetching domains for args.software='%s'", args.software)
1519         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1520     elif args.mode is not None and args.mode != "":
1521         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1522         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1523     elif args.no_software:
1524         logger.info("Fetching domains with no software type detected ...")
1525         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1526     elif args.with_software:
1527         logger.info("Fetching domains with any software type detected ...")
1528         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1529     elif args.no_auto:
1530         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1531         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1532     elif args.no_detection:
1533         logger.info("Fetching domains with no detection mode being set ...")
1534         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1535     else:
1536         logger.info("Fetching all domains, least recently updated first ...")
1537         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1538
1539     domains = database.cursor.fetchall()
1540
1541     logger.info("Checking %d domain(s) ...", len(domains))
1542     cnt = 0
1543     for row in domains:
1544         logger.debug("row[]='%s'", type(row))
1545         if blacklist.is_blacklisted(row["domain"]):
1546             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1547             continue
1548         elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1549             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1550             continue
1551
1552         try:
1553             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1554             software = federation.determine_software(row["domain"])
1555
1556             logger.debug("Determined software='%s'", software)
1557             if (software != row["software"] and software is not None) or args.force is True:
1558                 logger.debug("software='%s'", software)
1559                 if software is None:
1560                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1561                     instances.set_nodeinfo_url(row["domain"], None)
1562
1563                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1564                 instances.set_software(row["domain"], software)
1565
1566             if software is not None:
1567                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1568                 instances.set_success(row["domain"])
1569         except network.exceptions as exception:
1570             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1571             instances.set_last_error(row["domain"], exception)
1572
1573         instances.set_last_nodeinfo(row["domain"])
1574         instances.update(row["domain"])
1575         cnt = cnt + 1
1576
1577     logger.debug("Success! - EXIT!")
1578     return 0
1579
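# update_nodeinfo() above only rewrites the stored software type when the
# redetected value differs, or when --force is given; a None detection result
# never silently overwrites a known value without --force. The decision in
# isolation:
def _sketch_should_update_software(old: str, new: str, force: bool) -> bool:
    # Mirrors: (software != row["software"] and software is not None)
    #          or args.force is True
    return force or (new is not None and new != old)
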
1580 def fetch_instances_social(args: argparse.Namespace) -> int:
1581     logger.debug("args[]='%s' - CALLED!", type(args))
1582
1583     logger.debug("Invoking locking.acquire() ...")
1584     locking.acquire()
1585
1586     source_domain = "instances.social"
1587
1588     if config.get("instances_social_api_key") == "":
1589         logger.error("API key not set. Please set it in your config.json file.")
1590         return 1
1591     elif sources.is_recent(source_domain):
1592         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1593         return 2
1594     else:
1595         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1596         sources.update(source_domain)
1597
1598     headers = {
1599         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1600     }
1601
1602     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1603     fetched = network.get_json_api(
1604         source_domain,
1605         "/api/1.0/instances/list?count=0&sort_by=name",
1606         headers=headers,
1607         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1608     )
1609     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1610
1611     if "error_message" in fetched:
1612         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1613         return 2
1614     elif "exception" in fetched:
1615         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1616         return 3
1617     elif "json" not in fetched:
1618         logger.warning("fetched has no element 'json' - EXIT!")
1619         return 4
1620     elif "instances" not in fetched["json"]:
1621         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1622         return 5
1623
1624     domains = list()
1625     rows = fetched["json"]["instances"]
1626
1627     logger.info("Checking %d row(s) ...", len(rows))
1628     for row in rows:
1629         logger.debug("row[]='%s'", type(row))
1630         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1631         logger.debug("domain='%s' - AFTER!", domain)
1632
1633         if domain in [None, ""]:
1634             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1635             continue
1636
1637         logger.debug("domain='%s' - BEFORE!", domain)
1638         domain = domain.encode("idna").decode("utf-8")
1639         logger.debug("domain='%s' - AFTER!", domain)
1640
1641         if not domain_helper.is_wanted(domain):
1642             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1643             continue
1644         elif domain in domains:
1645             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1646             continue
1647         elif instances.is_registered(domain):
1648             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1649             continue
1650         elif instances.is_recent(domain):
1651             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1652             continue
1653
1654         logger.info("Fetching instances from domain='%s' ...", domain)
1655         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1656
1657     logger.debug("Success! - EXIT!")
1658     return 0
1659
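# fetch_instances_social() above authenticates against the instances.social
# API with a bearer token from config.json. The header construction in
# isolation (the token value in the example is invented):
def _sketch_bearer_headers(api_key: str) -> dict:
    # A standard "Authorization: Bearer <key>" header, as used for the
    # /api/1.0/instances/list call above.
    return {"Authorization": f"Bearer {api_key}"}

# Example: _sketch_bearer_headers("secret123") returns
# {"Authorization": "Bearer secret123"}.
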
1660 def fetch_relaylist(args: argparse.Namespace) -> int:
1661     logger.debug("args[]='%s' - CALLED!", type(args))
1662
1663     logger.debug("Invoking locking.acquire() ...")
1664     locking.acquire()
1665
1666     source_domain = "api.relaylist.com"
1667
1668     if sources.is_recent(source_domain):
1669         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1670         return 1
1671     else:
1672         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1673         sources.update(source_domain)
1674
1675     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1676     fetched = network.get_json_api(
1677         source_domain,
1678         "/relays",
1679         {},
1680         (config.get("connection_timeout"), config.get("read_timeout"))
1681     )
1682     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1683
1684     if "error_message" in fetched:
1685         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1686         return 2
1687     elif "exception" in fetched:
1688         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1689         return 3
1690     elif "json" not in fetched:
1691         logger.warning("fetched has no element 'json' - EXIT!")
1692         return 4
1693
1694     domains = list()
1695
1696     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1697     for row in fetched["json"]:
1698         logger.debug("row[]='%s'", type(row))
1699         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1700         logger.debug("domain='%s' - AFTER!", domain)
1701
1702         if domain in [None, ""]:
1703             logger.debug("domain='%s' is empty - SKIPPED!", domain)
1704             continue
1705
1706         logger.debug("domain='%s' - BEFORE!", domain)
1707         domain = domain.encode("idna").decode("utf-8")
1708         logger.debug("domain='%s' - AFTER!", domain)
1709
1710         if not domain_helper.is_wanted(domain):
1711             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1712             continue
1713         elif domain in domains:
1714             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1715             continue
1716         elif instances.is_registered(domain):
1717             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1718             continue
1719         elif instances.is_recent(domain):
1720             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1721             continue
1722
1723         logger.info("Fetching instances from domain='%s'", domain)
1724         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1725
1726     logger.debug("Success! - EXIT!")
1727     return 0
1728
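# fetch_relaylist() above reduces each relay URL to a bare lowercase
# hostname. That normalization in isolation, including the port strip:
def _sketch_url_to_domain(url: str) -> str:
    from urllib.parse import urlparse
    # netloc may carry "host:port"; splitting on ":" keeps only the host.
    return urlparse(url).netloc.lower().split(":")[0]

# Example: _sketch_url_to_domain("https://Relay.Example.com:443/inbox")
# returns "relay.example.com".
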
1729 def fetch_relays(args: argparse.Namespace) -> int:
1730     logger.debug("args[]='%s' - CALLED!", type(args))
1731
1732     logger.debug("Invoking locking.acquire() ...")
1733     locking.acquire()
1734
1735     if args.domain is not None and args.domain != "":
1736         logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1737         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1738     elif args.software is not None and args.software != "":
1739         logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1740         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1741     else:
1742         logger.debug("Fetch all relay instances ...")
1743         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1744
1745     domains = list()
1746     rows = database.cursor.fetchall()
1747
1748     logger.info("Checking %d relays ...", len(rows))
1749     for row in rows:
1750         logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1751         if not args.force and instances.is_recent(row["domain"]):
1752             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1753             continue
1754         elif row["nodeinfo_url"] is None:
1755             logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1756             continue
1757
1758         peers = list()
1759         try:
1760             logger.debug("row[domain]='%s',row[software]='%s' - checking ...", row["domain"], row["software"])
1761             if row["software"] == "pub-relay":
1762                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1763                 raw = network.fetch_api_url(
1764                     row["nodeinfo_url"],
1765                     (config.get("connection_timeout"), config.get("read_timeout"))
1766                 )
1767
1768                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1769                 if "exception" in raw:
1770                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1771                     raise raw["exception"]
1772                 elif "error_message" in raw:
1773                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1774                     instances.set_last_error(row["domain"], raw)
1775                     instances.set_last_instance_fetch(row["domain"])
1776                     instances.update(row["domain"])
1777                     continue
1778                 elif "json" not in raw:
1779                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1780                     continue
1781                 elif "metadata" not in raw["json"]:
1782                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1783                     continue
1784                 elif "peers" not in raw["json"]["metadata"]:
1785                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1786                     continue
1787             else:
1788                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1789                 raw = network.fetch_url(
1790                     f"https://{row['domain']}",
1791                     network.web_headers,
1792                     (config.get("connection_timeout"), config.get("read_timeout"))
1793                 ).text
1794                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1795
1796                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1797                 logger.debug("doc[]='%s'", type(doc))
1798
1799         except network.exceptions as exception:
1800             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1801             instances.set_last_error(row["domain"], exception)
1802             instances.set_last_instance_fetch(row["domain"])
1803             instances.update(row["domain"])
1804             continue
1805
1806         logger.debug("row[software]='%s'", row["software"])
1807         if row["software"] == "activityrelay":
1808             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1809             tags = doc.findAll("p")
1810
1811             logger.debug("Checking %d paragraphs ...", len(tags))
1812             for tag in tags:
1813                 logger.debug("tag[]='%s'", type(tag))
1814                 if len(tag.contents) == 0:
1815                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1816                     continue
1817                 elif "registered instances" not in tag.contents[0]:
1818                     logger.debug("Skipping paragraph, text not found.")
1819                     continue
1820
1821                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1822                 for domain in tag.contents:
1823                     logger.debug("domain[%s]='%s'", type(domain), domain)
1824                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1825                         continue
1826
1827                     domain = str(domain)
1828                     logger.debug("domain='%s'", domain)
1829                     if not domain_helper.is_wanted(domain):
1830                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1831                         continue
1832
1833                     logger.debug("domain='%s' - BEFORE!", domain)
1834                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1835                     logger.debug("domain='%s' - AFTER!", domain)
1836
1837                     if domain in [None, ""]:
1838                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1839                         continue
1840                     elif domain not in peers:
1841                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1842                         peers.append(domain)
1843
1844                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1845                     if dict_helper.has_key(domains, "domain", domain):
1846                         logger.debug("domain='%s' already added", domain)
1847                         continue
1848
1849                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1850                     domains.append({
1851                         "domain": domain,
1852                         "origin": row["domain"],
1853                     })
1854         elif row["software"] in ["aoderelay", "selective-relay"]:
1855             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1856             if row["software"] == "aoderelay":
1857                 tags = doc.findAll("section", {"class": "instance"})
1858             else:
1859                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1860
1861             logger.debug("Checking %d tags ...", len(tags))
1862             for tag in tags:
1863                 logger.debug("tag[]='%s'", type(tag))
1864
1865                 link = tag.find("a")
1866                 logger.debug("link[%s]='%s'", type(link), link)
1867                 if not isinstance(link, bs4.element.Tag):
1868                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1869                     continue
1870
1871                 components = urlparse(link.get("href"))
1872                 logger.debug("components(%d)='%s'", len(components), components)
1873                 domain = components.netloc.lower().split(":")[0]
1874
1875                 logger.debug("domain='%s' - BEFORE!", domain)
1876                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1877                 logger.debug("domain='%s' - AFTER!", domain)
1878
1879                 if domain in [None, ""]:
1880                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1881                     continue
1882                 elif domain not in peers:
1883                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1884                     peers.append(domain)
1885
1886                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1887                 if dict_helper.has_key(domains, "domain", domain):
1888                     logger.debug("domain='%s' already added", domain)
1889                     continue
1890
1891                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1892                 domains.append({
1893                     "domain": domain,
1894                     "origin": row["domain"],
1895                 })
1896         elif row["software"] == "pub-relay":
1897             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1898             for domain in raw["json"]["metadata"]["peers"]:
1899                 logger.debug("domain='%s' - BEFORE!", domain)
1900                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1901                 logger.debug("domain='%s' - AFTER!", domain)
1902
1903                 if domain in [None, ""]:
1904                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1905                     continue
1906                 elif domain not in peers:
1907                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1908                     peers.append(domain)
1909
1910                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1911                 if dict_helper.has_key(domains, "domain", domain):
1912                     logger.debug("domain='%s' already added", domain)
1913                     continue
1914
1915                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1916                 domains.append({
1917                     "domain": domain,
1918                     "origin": row["domain"],
1919                 })
1920         else:
1921             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1922             continue
1923
1924         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1925         instances.set_last_instance_fetch(row["domain"])
1926
1927         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1928         instances.set_total_peers(row["domain"], peers)
1929
1930         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1931         instances.update(row["domain"])
1932
1933     logger.info("Checking %d domains ...", len(domains))
1934     for row in domains:
1935         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1936         if not domain_helper.is_wanted(row["domain"]):
1937             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1938             continue
1939         elif instances.is_registered(row["domain"]):
1940             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1941             continue
1942
1943         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1944         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1945
1946     logger.debug("Success! - EXIT!")
1947     return 0
1948
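# fetch_relays() above deduplicates discovered peers with
# dict_helper.has_key() before appending {"domain": ..., "origin": ...}
# records. A simplified stand-in for that membership test:
def _sketch_has_key(rows: list, key: str, value: str) -> bool:
    # True when any dict in the list already carries value under key.
    return any(row.get(key) == value for row in rows)
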
1949 def convert_idna(args: argparse.Namespace) -> int:
1950     logger.debug("args[]='%s' - CALLED!", type(args))
1951
1952     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1953     rows = database.cursor.fetchall()
1954
1955     logger.debug("rows[]='%s'", type(rows))
1956     instances.translate_idnas(rows, "domain")
1957
1958     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1959     rows = database.cursor.fetchall()
1960
1961     logger.debug("rows[]='%s'", type(rows))
1962     instances.translate_idnas(rows, "origin")
1963
1964     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1965     rows = database.cursor.fetchall()
1966
1967     logger.debug("rows[]='%s'", type(rows))
1968     blocks.translate_idnas(rows, "blocker")
1969
1970     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1971     rows = database.cursor.fetchall()
1972
1973     logger.debug("rows[]='%s'", type(rows))
1974     blocks.translate_idnas(rows, "blocked")
1975
1976     logger.debug("Success! - EXIT!")
1977     return 0
1978
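# convert_idna() above hands non-punycode rows to the translate_idnas()
# helpers. The underlying codec round-trip, shown standalone:
def _sketch_to_punycode(domain: str) -> str:
    # Python's "idna" codec converts Unicode labels to their ASCII
    # ("xn--") form; decode("utf-8") yields a str instead of bytes.
    return domain.encode("idna").decode("utf-8")

# Example: _sketch_to_punycode("bücher.example") returns
# "xn--bcher-kva.example".
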
1979 def remove_invalid(args: argparse.Namespace) -> int:
1980     logger.debug("args[]='%s' - CALLED!", type(args))
1981
1982     logger.debug("Invoking locking.acquire() ...")
1983     locking.acquire()
1984
1985     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1986     rows = database.cursor.fetchall()
1987
1988     logger.info("Checking %d domains ...", len(rows))
1989     for row in rows:
1990         logger.debug("row[domain]='%s'", row["domain"])
1991         if not validators.domain(row["domain"].split("/")[0]):
1992             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1993             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1994             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1995
1996     logger.debug("Invoking commit() ...")
1997     database.connection.commit()
1998
1999     logger.info("Vacuum cleaning database ...")
2000     database.cursor.execute("VACUUM")
2001
2002     logger.debug("Success! - EXIT!")
2003     return 0
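
# remove_invalid() above keeps only rows whose leading path segment is a
# syntactically valid domain, then compacts the database file with VACUUM.
# The validity test in isolation:
def _sketch_is_valid_domain(value: str) -> bool:
    import validators
    # validators.domain() returns a falsy failure object rather than
    # raising, so bool() gives a clean True/False.
    return bool(validators.domain(value.split("/")[0]))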