# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
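# A minimal sketch (assumption, not part of the original code) of how a caller
# could raise this module's verbosity without editing the line above, using
# only the standard logging API:
#
#   import logging
#   logging.getLogger("fba.commands").setLevel(logging.DEBUG)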

def check_instance(args: argparse.Namespace) -> int:
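    """
    Checks whether args.domain is a valid domain name that is neither
    blacklisted nor already registered. Returns 0 when the domain is unknown,
    100-102 otherwise.
    """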
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status
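
# Usage sketch (assumption, not part of the original module): check_instance()
# is normally dispatched from fba.py's argument parser, but it only needs a
# namespace providing a "domain" attribute:
#
#   args = argparse.Namespace(domain="example.org")
#   status = check_instance(args)  # 0 = unknown, 100/101/102 = rejected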

def check_nodeinfo(args: argparse.Namespace) -> int:
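    """
    Cross-checks each stored nodeinfo_url against its instance's domain (and
    the domain's punycode form) and reports entries whose URL points at a
    foreign host.
    """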
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
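    """
    Fetches the public server list from the pixelfed.org API and registers all
    new, wanted instances found there.
    """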
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default; there is no need to add network.source_headers here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
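    """
    Fetches a domain list from the gql.api.bka.li GraphQL API and registers all
    new, wanted instances from it.
    """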
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
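    """
    Fetches blocklists from all known instances (or from a single domain or
    software type when given), deobfuscates wildcarded entries where possible
    and stores the resulting blocks.
    """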
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=None
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND total_blocks IS NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has been recently accessed - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
                logger.debug("blocked='%s' is an I2P domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse DNS domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
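                # instances.deobfuscate() matches the wildcard pattern (and the
                # digest, when one is present) against already-registered domains;
                # e.g. an entry like "fedi*.example" (hypothetical) may resolve to
                # a known domain.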
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some servers obscure domains with question marks instead; it is
                # unclear whether this depends on the software version
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("blocked='%s' is not registered, adding: blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
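    """
    Scrapes fediverse.observer for instance domains, optionally restricted to
    one software type, and registers all new, wanted domains.
    """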
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain[%s]='%s'", type(domain), domain)
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
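    """
    Fetches the domain block wiki page from wiki.todon.eu and stores the
    silenced and suspended entries as blocks for todon.eu.
    """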
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
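    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com, parses
    the silenced/blocked tables and stores the entries as blocks.
    """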
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
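    """
    Fetches an FBA-specific RSS feed (args.feed) and registers all new, wanted
    domains found in its items.
    """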
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
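    """
    Fetches the Atom feed of the FBA bot account (ryona.agency by default, or
    args.feed when given) and registers all new, wanted domains linked from
    its entries.
    """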
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))

            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
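    """
    Fetches the instance list of args.domain and, unless --single is given,
    also of all known instances that are due for a re-check.
    """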
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software is None:
        logger.warning("args.domain='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated.", args.domain, args.domain)
        return 102
    elif software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 103

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 104

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
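    """
    Processes all configured CSV blocklists (blocklists.csv_files), optionally
    restricted to a single blocker via args.domain.
    """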
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
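    """
    Downloads the oliphant blocklists from codeberg.org and processes each CSV
    file, optionally restricted to a single blocker via args.domain.
    """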
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
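    """
    Fetches all configured plain-text blocklists (blocklists.txt_files) and
    processes every domain listed in them.
    """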
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain in [None, ""]:
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
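    """
    Scrapes the list of pact signers from fedipact.online and registers all
    new, wanted domains.
    """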
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching / from source_domain='%s' ...", source_domain)
    response = utils.fetch_url(
        f"https://{source_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0
1167
1168 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
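         """Fetch the Mobilizon instance directory from
         instances.joinmobilizon.org and queue unknown, wanted hosts
         for crawling."""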
1169     logger.debug("args[]='%s' - CALLED!", type(args))
1170
1171     logger.debug("Invoking locking.acquire() ...")
1172     locking.acquire()
1173
1174     source_domain = "instances.joinmobilizon.org"
1175     if sources.is_recent(source_domain):
1176         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1177         return 1
1178     else:
1179         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1180         sources.update(source_domain)
1181
1182     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1183     raw = utils.fetch_url(
1184         f"https://{source_domain}/api/v1/instances",
1185         network.web_headers,
1186         (config.get("connection_timeout"), config.get("read_timeout"))
1187     ).text
1188     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1189
1190     parsed = json.loads(raw)
1191     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1192
1193     if "data" not in parsed:
1194         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1195         return 1
1196
1197     logger.info("Checking %d instances ...", len(parsed["data"]))
1198     for row in parsed["data"]:
1199         logger.debug("row[]='%s'", type(row))
1200         if "host" not in row:
1201             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1202             continue
1203         elif not domain_helper.is_wanted(row["host"]):
1204             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1205             continue
1206         elif instances.is_registered(row["host"]):
1207             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1208             continue
1209
1210         logger.info("Fetching row[host]='%s' ...", row["host"])
1211         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1212
1213     logger.debug("Success! - EXIT!")
1214     return 0
1215
1216 def fetch_joinmisskey(args: argparse.Namespace) -> int:
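         """Fetch instances.json from instanceapp.misskey.page and queue
         unknown, wanted Misskey instances for crawling."""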
1217     logger.debug("args[]='%s' - CALLED!", type(args))
1218
1219     logger.debug("Invoking locking.acquire() ...")
1220     locking.acquire()
1221
1222     source_domain = "instanceapp.misskey.page"
1223     if sources.is_recent(source_domain):
1224         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1225         return 1
1226     else:
1227         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1228         sources.update(source_domain)
1229
1230     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1231     raw = utils.fetch_url(
1232         f"https://{source_domain}/instances.json",
1233         network.web_headers,
1234         (config.get("connection_timeout"), config.get("read_timeout"))
1235     ).text
1236     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1237
1238     parsed = json.loads(raw)
1239     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1240
1241     if "instancesInfos" not in parsed:
1242         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1243         return 1
1244
1245     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1246     for row in parsed["instancesInfos"]:
1247         logger.debug("row[%s]='%s'", type(row), row)
1248         if "url" not in row:
1249             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1250             continue
1251         elif not domain_helper.is_wanted(row["url"]):
1252             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1253             continue
1254         elif instances.is_registered(row["url"]):
1255             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1256             continue
1257
1258         logger.info("Fetching row[url]='%s' ...", row["url"])
1259         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1260
1261     logger.debug("Success! - EXIT!")
1262     return 0
1263
1264 def recheck_obfuscation(args: argparse.Namespace) -> int:
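         """Re-fetch block lists of instances flagged as (possibly)
         obfuscated, deobfuscate wildcard entries where possible and
         record the remaining obfuscated count per instance."""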
1265     logger.debug("args[]='%s' - CALLED!", type(args))
1266
1267     logger.debug("Invoking locking.acquire() ...")
1268     locking.acquire()
1269
1270     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1271         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1272     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1273         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1274     else:
1275         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1276
1277     rows = database.cursor.fetchall()
1278     logger.info("Checking %d domains ...", len(rows))
1279     for row in rows:
1280         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1281         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1282             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1283             continue
1284         elif blacklist.is_blacklisted(row["domain"]):
1285             logger.warning("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1286             continue
1287
1288         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1289         blocking = federation.fetch_blocks(row["domain"])
1290
1291         logger.debug("blocking()=%d", len(blocking))
1292         if len(blocking) == 0:
1293             logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1294             if row["software"] == "pleroma":
1295                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1296                 blocking = pleroma.fetch_blocks(row["domain"])
1297             elif row["software"] == "mastodon":
1298                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1299                 blocking = mastodon.fetch_blocks(row["domain"])
1300             elif row["software"] == "lemmy":
1301                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1302                 blocking = lemmy.fetch_blocks(row["domain"])
1303             elif row["software"] == "friendica":
1304                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1305                 blocking = friendica.fetch_blocks(row["domain"])
1306             elif row["software"] == "misskey":
1307                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1308                 blocking = misskey.fetch_blocks(row["domain"])
1309             else:
1310                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1311
1312         # c.s isn't part of oliphant's "hidden" blocklists
1313         logger.debug("row[domain]='%s'", row["domain"])
1314         if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1315             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1316             instances.set_last_blocked(row["domain"])
1317             instances.set_total_blocks(row["domain"], blocking)
1318
1319         obfuscated = 0
1320         blockdict = list()
1321
1322         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1323         for block in blocking:
1324             logger.debug("block[blocked]='%s'", block["blocked"])
1325             blocked = None
1326
1327             if block["blocked"] == "":
1328                 logger.debug("block[blocked] is empty - SKIPPED!")
1329                 continue
1330             elif block["blocked"].endswith(".onion"):
1331                 logger.debug("blocked='%s' is a Tor onion domain name - SKIPPED!", block["blocked"])
1332                 continue
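                 # Assumed gating semantics: "allow_i2p_domain" controls whether
                 # .i2p domains are kept, so skip them only when it is not "true".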
1333             elif block["blocked"].endswith(".i2p") and config.get("allow_i2p_domain") != "true":
1334                 logger.debug("blocked='%s' is an I2P domain name - SKIPPED!", block["blocked"])
1335                 continue
1336             elif block["blocked"].endswith(".arpa"):
1337                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1338                 continue
1339             elif block["blocked"].endswith(".tld"):
1340                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1341                 continue
1342             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1343                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1344                 obfuscated = obfuscated + 1
1345                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1346             elif not domain_helper.is_wanted(block["blocked"]):
1347                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1348                 continue
1349             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1350                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1351                 continue
1352
1353             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1354             if blocked is not None and blocked != block["blocked"]:
1355                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1356                 obfuscated = obfuscated - 1
1357
1358                 if blacklist.is_blacklisted(blocked):
1359                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1360                     continue
1361                 elif blacklist.is_blacklisted(row["domain"]):
1362                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1363                     continue
1364                 elif blocks.is_instance_blocked(row["domain"], blocked):
1365                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1366                     continue
1367
1368                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1369
1370                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1371                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1372                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1373                     blockdict.append({
1374                         "blocked": blocked,
1375                         "reason" : block["reason"],
1376                     })
1377
1378         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1379         instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1380         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1381
1382         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1383         if instances.has_pending(row["domain"]):
1384             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1385             instances.update(row["domain"])
1386
1387         logger.debug("Invoking commit() ...")
1388         database.connection.commit()
1389
1390         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1391         if config.get("bot_enabled") and len(blockdict) > 0:
1392             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1393             network.send_bot_post(row["domain"], blockdict)
1394
1395     logger.debug("Success! - EXIT!")
1396     return 0
1397
1398 def fetch_fedilist(args: argparse.Namespace) -> int:
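         """Import the instance CSV from demo.fedilist.com (optionally
         filtered by --software) and queue new, wanted domains for
         crawling."""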
1399     logger.debug("args[]='%s' - CALLED!", type(args))
1400
1401     logger.debug("Invoking locking.acquire() ...")
1402     locking.acquire()
1403
1404     source_domain = "demo.fedilist.com"
1405     if sources.is_recent(source_domain):
1406         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1407         return 1
1408     else:
1409         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1410         sources.update(source_domain)
1411
1412     url = f"http://{source_domain}/instance/csv?onion=not"
1413     if args.software is not None and args.software != "":
1414         logger.debug("args.software='%s'", args.software)
1415         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1416
1417     logger.info("Fetching url='%s' ...", url)
1418     response = reqto.get(
1419         url,
1420         headers=network.web_headers,
1421         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1422         allow_redirects=False
1423     )
1424
1425     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1426     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1427         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1428         return 1
1429
1430     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1431
1432     logger.debug("reader[]='%s'", type(reader))
1433     if reader is None:
1434         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1435         return 2
1436
1437     rows = list(reader)
1438
1439     logger.info("Checking %d rows ...", len(rows))
1440     for row in rows:
1441         logger.debug("row[]='%s'", type(row))
1442         if "hostname" not in row:
1443             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1444             continue
1445
1446         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1447         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1448         logger.debug("domain='%s' - AFTER!", domain)
1449
1450         if domain in [None, ""]:
1451             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1452             continue
1453
1454         logger.debug("domain='%s' - BEFORE!", domain)
1455         domain = domain.encode("idna").decode("utf-8")
1456         logger.debug("domain='%s' - AFTER!", domain)
1457
1458         if not domain_helper.is_wanted(domain):
1459             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1460             continue
1461         elif (args.force is None or not args.force) and instances.is_registered(domain):
1462             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1463             continue
1464         elif instances.is_recent(domain):
1465             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1466             continue
1467
1468         logger.info("Fetching instances from domain='%s' ...", domain)
1469         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1470
1471     logger.debug("Success! - EXIT!")
1472     return 0
1473
1474 def update_nodeinfo(args: argparse.Namespace) -> int:
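         """Re-run software detection (nodeinfo) for instances selected by
         --domain, --software, --mode or related flags and persist any
         changed software type."""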
1475     logger.debug("args[]='%s' - CALLED!", type(args))
1476
1477     logger.debug("Invoking locking.acquire() ...")
1478     locking.acquire()
1479
1480     if args.domain is not None and args.domain != "":
1481         logger.debug("Fetching args.domain='%s'", args.domain)
1482         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1483     elif args.software is not None and args.software != "":
1484         logger.info("Fetching domains for args.software='%s'", args.software)
1485         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1486     elif args.mode is not None and args.mode != "":
1487         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1488         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1489     elif args.no_software:
1490         logger.info("Fetching domains with no software type detected ...")
1491         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1492     elif args.with_software:
1493         logger.info("Fetching domains with any software type detected ...")
1494         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1495     elif args.no_auto:
1496         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY ...")
1497         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1498     elif args.no_detection:
1499         logger.info("Fetching domains with no detection mode being set ...")
1500         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1501     else:
1502         logger.info("Fetching all domains, least recently updated first ...")
1503         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1504
1505     domains = database.cursor.fetchall()
1506
1507     logger.info("Checking %d domain(s) ...", len(domains))
1508     cnt = 0
1509     for row in domains:
1510         logger.debug("row[]='%s'", type(row))
1511         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1512             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1513             continue
1514         elif blacklist.is_blacklisted(row["domain"]):
1515             logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1516             continue
1517
1518         try:
1519             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1520             software = federation.determine_software(row["domain"])
1521
1522             logger.debug("Determined software='%s'", software)
1523             if (software != row["software"] and software is not None) or args.force is True:
1524                 logger.debug("software='%s'", software)
1525                 if software is None:
1526                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1527                     instances.set_nodeinfo_url(row["domain"], None)
1528
1529                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1530                 instances.set_software(row["domain"], software)
1531
1532             if software is not None:
1533                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1534                 instances.set_success(row["domain"])
1535         except network.exceptions as exception:
1536             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1537             instances.set_last_error(row["domain"], exception)
1538
1539         instances.set_last_nodeinfo(row["domain"])
1540         instances.update(row["domain"])
1541         cnt = cnt + 1
1542
1543     logger.debug("Success! - EXIT!")
1544     return 0
1545
1546 def fetch_instances_social(args: argparse.Namespace) -> int:
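         """Query the instances.social list API (requires
         'instances_social_api_key' in config.json) and queue new,
         wanted domains for crawling."""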
1547     logger.debug("args[]='%s' - CALLED!", type(args))
1548
1549     logger.debug("Invoking locking.acquire() ...")
1550     locking.acquire()
1551
1552     source_domain = "instances.social"
1553
1554     if config.get("instances_social_api_key") == "":
1555         logger.error("API key not set. Please set in your config.json file.")
1556         return 1
1557     elif sources.is_recent(source_domain):
1558         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1559         return 2
1560     else:
1561         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1562         sources.update(source_domain)
1563
1564     headers = {
1565         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1566     }
1567
1568     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1569     fetched = network.get_json_api(
1570         source_domain,
1571         "/api/1.0/instances/list?count=0&sort_by=name",
1572         headers=headers,
1573         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1574     )
1575     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1576
1577     if "error_message" in fetched:
1578         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1579         return 2
1580     elif "exception" in fetched:
1581         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1582         return 3
1583     elif "json" not in fetched:
1584         logger.warning("fetched has no element 'json' - EXIT!")
1585         return 4
1586     elif "instances" not in fetched["json"]:
1587         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1588         return 5
1589
1590     domains = list()
1591     rows = fetched["json"]["instances"]
1592
1593     logger.info("Checking %d row(s) ...", len(rows))
1594     for row in rows:
1595         logger.debug("row[]='%s'", type(row))
1596         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1597         logger.debug("domain='%s' - AFTER!", domain)
1598
1599         if domain in [None, ""]:
1600             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1601             continue
1602
1603         logger.debug("domain='%s' - BEFORE!", domain)
1604         domain = domain.encode("idna").decode("utf-8")
1605         logger.debug("domain='%s' - AFTER!", domain)
1606
1607         if not domain_helper.is_wanted(domain):
1608             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1609             continue
1610         elif domain in domains:
1611             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1612             continue
1613         elif instances.is_registered(domain):
1614             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1615             continue
1616         elif instances.is_recent(domain):
1617             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1618             continue
1619
1620         logger.info("Fetching instances from domain='%s'", domain)
1621         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1622
1623     logger.debug("Success! - EXIT!")
1624     return 0
1625
1626 def fetch_relaylist(args: argparse.Namespace) -> int:
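         """Fetch the relay directory from api.relaylist.com (/relays)
         and queue each new, wanted relay domain for crawling."""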
1627     logger.debug("args[]='%s' - CALLED!", type(args))
1628
1629     logger.debug("Invoking locking.acquire() ...")
1630     locking.acquire()
1631
1632     source_domain = "api.relaylist.com"
1633
1634     if sources.is_recent(source_domain):
1635         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1636         return 1
1637     else:
1638         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1639         sources.update(source_domain)
1640
1641     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1642     fetched = network.get_json_api(
1643         source_domain,
1644         "/relays",
1645         {},
1646         (config.get("connection_timeout"), config.get("read_timeout"))
1647     )
1648     logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1649
1650     if "error_message" in fetched:
1651         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1652         return 2
1653     elif "exception" in fetched:
1654         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1655         return 3
1656     elif "json" not in fetched:
1657         logger.warning("fetched has no element 'json' - EXIT!")
1658         return 4
1659
1660     domains = list()
1661
1662     logger.info("Checking %d row(s) ...", len(fetched["json"]))
1663     for row in fetched["json"]:
1664         logger.debug("row[]='%s'", type(row))
1665         domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1666         logger.debug("domain='%s' - AFTER!", domain)
1667
1668         if domain in [None, ""]:
1669             logger.debug("domain='%s' is empty - SKIPPED!", domain)
1670             continue
1671
1672         logger.debug("domain='%s' - BEFORE!", domain)
1673         domain = domain.encode("idna").decode("utf-8")
1674         logger.debug("domain='%s' - AFTER!", domain)
1675
1676         if not domain_helper.is_wanted(domain):
1677             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1678             continue
1679         elif domain in domains:
1680             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1681             continue
1682         elif instances.is_registered(domain):
1683             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1684             continue
1685         elif instances.is_recent(domain):
1686             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1687             continue
1688
1689         logger.info("Fetching instances from domain='%s'", domain)
1690         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1691
1692     logger.debug("Success! - EXIT!")
1693     return 0
1694
1695 def fetch_relays(args: argparse.Namespace) -> int:
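         """Fetch peer lists from known relay software: HTML scraping for
         activityrelay/aoderelay/selective-relay, nodeinfo metadata for
         pub-relay. Stores peer counts and queues newly found domains."""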
1696     logger.debug("args[]='%s' - CALLED!", type(args))
1697
1698     logger.debug("Invoking locking.acquire() ...")
1699     locking.acquire()
1700
1701     if args.domain is not None and args.domain != "":
1702         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1703     elif args.software is not None and args.software != "":
1704         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1705     else:
1706         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1707
1708     domains = list()
1709     rows = database.cursor.fetchall()
1710
1711     logger.info("Checking %d relays ...", len(rows))
1712     for row in rows:
1713         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1714         if not args.force and instances.is_recent(row["domain"]):
1715             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1716             continue
1717
1718         peers = list()
1719         try:
1720             if row["software"] == "pub-relay":
1721                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1722                 raw = network.fetch_api_url(
1723                     row["nodeinfo_url"],
1724                     (config.get("connection_timeout"), config.get("read_timeout"))
1725                 )
1726
1727                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1728                 if "exception" in raw:
1729                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1730                     raise raw["exception"]
1731                 elif "error_message" in raw:
1732                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1733                     instances.set_last_error(row["domain"], raw)
1734                     instances.set_last_instance_fetch(row["domain"])
1735                     instances.update(row["domain"])
1736                     continue
1737                 elif "json" not in raw:
1738                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1739                     continue
1740                 elif "metadata" not in raw["json"]:
1741                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1742                     continue
1743                 elif "peers" not in raw["json"]["metadata"]:
1744                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1745                     continue
1746             else:
1747                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1748                 raw = utils.fetch_url(
1749                     f"https://{row['domain']}",
1750                     network.web_headers,
1751                     (config.get("connection_timeout"), config.get("read_timeout"))
1752                 ).text
1753                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1754
1755                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1756                 logger.debug("doc[]='%s'", type(doc))
1757
1758         except network.exceptions as exception:
1759             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1760             instances.set_last_error(row["domain"], exception)
1761             instances.set_last_instance_fetch(row["domain"])
1762             instances.update(row["domain"])
1763             continue
1764
1765         logger.debug("row[software]='%s'", row["software"])
1766         if row["software"] == "activityrelay":
1767             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1768             tags = doc.findAll("p")
1769
1770             logger.debug("Checking %d paragraphs ...", len(tags))
1771             for tag in tags:
1772                 logger.debug("tag[]='%s'", type(tag))
1773                 if len(tag.contents) == 0:
1774                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1775                     continue
1776                 elif "registered instances" not in tag.contents[0]:
1777                     logger.debug("Skipping paragraph, text not found.")
1778                     continue
1779
1780                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1781                 for domain in tag.contents:
1782                     logger.debug("domain[%s]='%s'", type(domain), domain)
1783                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1784                         continue
1785
1786                     domain = str(domain)
1787                     logger.debug("domain='%s'", domain)
1788                     if not domain_helper.is_wanted(domain):
1789                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1790                         continue
1791
1792                     logger.debug("domain='%s' - BEFORE!", domain)
1793                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1794                     logger.debug("domain='%s' - AFTER!", domain)
1795
1796                     if domain in [None, ""]:
1797                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1798                         continue
1799                     elif domain not in peers:
1800                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1801                         peers.append(domain)
1802
1803                     logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1804                     if dict_helper.has_key(domains, "domain", domain):
1805                         logger.debug("domain='%s' already added", domain)
1806                         continue
1807
1808                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1809                     domains.append({
1810                         "domain": domain,
1811                         "origin": row["domain"],
1812                     })
1813         elif row["software"] in ["aoderelay", "selective-relay"]:
1814             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1815             if row["software"] == "aoderelay":
1816                 tags = doc.findAll("section", {"class": "instance"})
1817             else:
1818                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1819
1820             logger.debug("Checking %d tags ...", len(tags))
1821             for tag in tags:
1822                 logger.debug("tag[]='%s'", type(tag))
1823
1824                 link = tag.find("a")
1825                 logger.debug("link[%s]='%s'", type(link), link)
1826                 if not isinstance(link, bs4.element.Tag):
1827                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1828                     continue
1829
1830                 components = urlparse(link.get("href"))
1831                 logger.debug("components(%d)='%s'", len(components), components)
1832                 domain = components.netloc.lower().split(":")[0]
1833
1834                 logger.debug("domain='%s' - BEFORE!", domain)
1835                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1836                 logger.debug("domain='%s' - AFTER!", domain)
1837
1838                 if domain in [None, ""]:
1839                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1840                     continue
1841                 elif domain not in peers:
1842                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1843                     peers.append(domain)
1844
1845                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1846                 if dict_helper.has_key(domains, "domain", domain):
1847                     logger.debug("domain='%s' already added", domain)
1848                     continue
1849
1850                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1851                 domains.append({
1852                     "domain": domain,
1853                     "origin": row["domain"],
1854                 })
1855         elif row["software"] == "pub-relay":
1856             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1857             for domain in raw["json"]["metadata"]["peers"]:
1858                 logger.debug("domain='%s' - BEFORE!", domain)
1859                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1860                 logger.debug("domain='%s' - AFTER!", domain)
1861
1862                 if domain in [None, ""]:
1863                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1864                     continue
1865                 elif domain not in peers:
1866                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1867                     peers.append(domain)
1868
1869                 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1870                 if dict_helper.has_key(domains, "domain", domain):
1871                     logger.debug("domain='%s' already added", domain)
1872                     continue
1873
1874                 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1875                 domains.append({
1876                     "domain": domain,
1877                     "origin": row["domain"],
1878                 })
1879         else:
1880             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1881             continue
1882
1883         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1884         instances.set_last_instance_fetch(row["domain"])
1885
1886         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1887         instances.set_total_peers(row["domain"], peers)
1888
1889         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1890         instances.update(row["domain"])
1891
1892     logger.info("Checking %d domains ...", len(domains))
1893     for row in domains:
1894         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1895         if not domain_helper.is_wanted(row["domain"]):
1896             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1897             continue
1898         elif instances.is_registered(row["domain"]):
1899             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1900             continue
1901
1902         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1903         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1904
1905     logger.debug("Success! - EXIT!")
1906     return 0
1907
1908 def convert_idna(args: argparse.Namespace) -> int:
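         """Translate all non-punycode values in instances.domain,
         instances.origin, blocks.blocker and blocks.blocked to their
         IDNA (punycode) form."""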
1909     logger.debug("args[]='%s' - CALLED!", type(args))
1910
1911     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1912     rows = database.cursor.fetchall()
1913
1914     logger.debug("rows[]='%s'", type(rows))
1915     instances.translate_idnas(rows, "domain")
1916
1917     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1918     rows = database.cursor.fetchall()
1919
1920     logger.debug("rows[]='%s'", type(rows))
1921     instances.translate_idnas(rows, "origin")
1922
1923     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1924     rows = database.cursor.fetchall()
1925
1926     logger.debug("rows[]='%s'", type(rows))
1927     blocks.translate_idnas(rows, "blocker")
1928
1929     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1930     rows = database.cursor.fetchall()
1931
1932     logger.debug("rows[]='%s'", type(rows))
1933     blocks.translate_idnas(rows, "blocked")
1934
1935     logger.debug("Success! - EXIT!")
1936     return 0
1937
1938 def remove_invalid(args: argparse.Namespace) -> int:
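         """Delete instances (and their block records) whose domain name
         fails validation, then VACUUM the database."""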
1939     logger.debug("args[]='%s' - CALLED!", type(args))
1940
1941     logger.debug("Invoking locking.acquire() ...")
1942     locking.acquire()
1943
1944     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1945     rows = database.cursor.fetchall()
1946
1947     logger.info("Checking %d domains ...", len(rows))
1948     for row in rows:
1949         logger.debug("row[domain]='%s'", row["domain"])
1950         if not validators.domain(row["domain"].split("/")[0]):
1951             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1952             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1953             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1954
1955     logger.debug("Invoking commit() ...")
1956     database.connection.commit()
1957
1958     logger.info("Vacuum cleaning database ...")
1959     database.cursor.execute("VACUUM")
1960
1961     logger.debug("Success! - EXIT!")
1962     return 0