fba/commands.py
# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import argparse
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
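    """
    Checks whether args.domain is valid, not blacklisted and not yet
    registered. Returns 0 if the domain is still unknown, otherwise a
    distinct error code (100-102).
    """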
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
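    """
    Cross-checks all stored absolute nodeinfo URLs against their instance's
    domain (including its punycode form) and warns about rows where the URL
    points elsewhere. Always returns 0.
    """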
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt += 1

    logger.info("Found %d mismatching row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
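    """
    Fetches the server list from pixelfed.org's API and crawls all new,
    wanted domains. Returns 0 on success, 1 if the source was accessed too
    recently, or an error code (100-103) otherwise.
    """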
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF is sent by default; there is no need to add network.source_headers manually here
    headers = dict()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
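    """
    Fetches a domain list from the gql.api.bka.li GraphQL API and crawls all
    new, wanted domains. Returns 0 on success, 1 if the source was accessed
    too recently, or an error code (100-102) otherwise.
    """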
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
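    """
    Fetches blocklists from registered instances: a single domain
    (args.domain), all instances of one software type (args.software), all
    supported instances (args.force) or only those due for a re-check.
    Attempts to deobfuscate obfuscated entries and records all found blocks.
    Returns 0 on success or an error code (100-102) for invalid arguments.
    """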
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./fba.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant or fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some friendica servers also obscure domains without a hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated += 1

                # Some servers obscure domains with question marks instead; unclear whether this depends on the software version
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated += 1
                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("blocked='%s' wasn't found, adding: blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
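    """
    Fetches instance tables from fediverse.observer, either for all software
    types listed in the site's navigation bar or only for args.software, and
    crawls all new, wanted domains. Returns 1 if the source was accessed too
    recently or the navigation bar cannot be parsed, otherwise 0.
    """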
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            domain = tidyup.domain(domain) if domain not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
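    """
    Fetches the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks of todon.eu. Returns 1 if the
    source was accessed too recently, otherwise 0.
    """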
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
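    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com,
    parses the rendered Markdown tables and records the silenced/blocked
    instances as blocks of chaos.social. Returns 1 if the source was
    accessed too recently, otherwise 0.
    """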
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
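    """
    Fetches an FBA-specific RSS feed (args.feed) and crawls all new, wanted
    domains found in it. Returns 100 on a network error, otherwise 0 (also
    when the feed was accessed too recently).
    """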
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0
    else:
        logger.debug("domain='%s' has not been recently used, marking ...", domain)
        sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
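    """
    Fetches the FBA bot's ATOM feed (ryona.agency by default, overridable
    via args.feed) and crawls all new, wanted domains linked from its
    entries. Returns 0 on success, 1 if the source was accessed too recently
    or 100 on a network error.
    """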
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain is None or domain == "":
                        logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
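    """
    Crawls instances starting from args.domain. With args.single only that
    domain is fetched, otherwise all supported instances due for a re-check
    are crawled as well. Returns 0 on success or an error code (100-102)
    otherwise.
    """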
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    if software_helper.is_relay(software):
        logger.warning("args.domain='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead.", args.domain, software)
        return 102

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
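    """
    Processes all configured CSV blocklists (blocklists.csv_files),
    optionally restricted to a single blocker via args.domain. Always
    returns 0.
    """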
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
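    """
    Downloads oliphant's blocklist CSV files from codeberg.org and processes
    them, optionally restricted to a single blocker via args.domain. Returns
    1 if the source was accessed too recently, otherwise 0.
    """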
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
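    """
    Fetches all configured plain-text blocklists (blocklists.txt_files) and
    processes every listed domain for its respective blocker. Always
    returns 0.
    """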
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None

                logger.debug("domain='%s' - AFTER!", domain)
                if domain is None or domain == "":
                    logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
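    """
    Fetches the list of pact signers from fedipact.online and crawls all
    new, wanted domains. Returns 1 if the source was accessed too recently,
    otherwise 0.
    """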
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 1
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching / from source_domain='%s' ...", source_domain)
    response = utils.fetch_url(
        f"https://{source_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain is None or domain == "":
                logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1158     logger.debug("args[]='%s' - CALLED!", type(args))
1159
1160     logger.debug("Invoking locking.acquire() ...")
1161     locking.acquire()
1162
1163     source_domain = "instances.joinmobilizon.org"
1164     if sources.is_recent(source_domain):
1165     logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1166         return 1
1167     else:
1168         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1169         sources.update(source_domain)
1170
1171     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1172     raw = utils.fetch_url(
1173         f"https://{source_domain}/api/v1/instances",
1174         network.web_headers,
1175         (config.get("connection_timeout"), config.get("read_timeout"))
1176     ).text
1177     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1178
1179     parsed = json.loads(raw)
1180     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1181
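    # Assumed response shape, inferred only from the key accesses below:
    #   {"data": [{"host": "mobilizon.example", ...}, ...]}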
1182     if "data" not in parsed:
1183         logger.warning("parsed()=%d does not contain key 'data' - EXIT!", len(parsed))
1184         return 1
1185
1186     logger.info("Checking %d instances ...", len(parsed["data"]))
1187     for row in parsed["data"]:
1188         logger.debug("row[]='%s'", type(row))
1189         if "host" not in row:
1190             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1191             continue
1192         elif not domain_helper.is_wanted(row["host"]):
1193             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1194             continue
1195         elif instances.is_registered(row["host"]):
1196             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1197             continue
1198
1199         logger.info("Fetching row[host]='%s' ...", row["host"])
1200         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1201
1202     logger.debug("Success! - EXIT!")
1203     return 0
1204
1205 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1206     logger.debug("args[]='%s' - CALLED!", type(args))
1207
1208     logger.debug("Invoking locking.acquire() ...")
1209     locking.acquire()
1210
1211     source_domain = "instanceapp.misskey.page"
1212     if sources.is_recent(source_domain):
1213     logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1214         return 1
1215     else:
1216         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1217         sources.update(source_domain)
1218
1219     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1220     raw = utils.fetch_url(
1221         f"https://{source_domain}/instances.json",
1222         network.web_headers,
1223         (config.get("connection_timeout"), config.get("read_timeout"))
1224     ).text
1225     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1226
1227     parsed = json.loads(raw)
1228     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1229
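    # Assumed shape of instances.json, inferred only from the key accesses
    # below:
    #   {"instancesInfos": [{"url": "misskey.example", ...}, ...]}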
1230     if "instancesInfos" not in parsed:
1231         logger.warning("parsed()=%d does not contain element 'instancesInfos' - EXIT!", len(parsed))
1232         return 1
1233
1234     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1235     for row in parsed["instancesInfos"]:
1236         logger.debug("row[%s]='%s'", type(row), row)
1237         if "url" not in row:
1238             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1239             continue
1240         elif not domain_helper.is_wanted(row["url"]):
1241             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1242             continue
1243         elif instances.is_registered(row["url"]):
1244             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1245             continue
1246
1247         logger.info("Fetching row[url]='%s' ...", row["url"])
1248         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1249
1250     logger.debug("Success! - EXIT!")
1251     return 0
1252
1253 def recheck_obfuscation(args: argparse.Namespace) -> int:
1254     logger.debug("args[]='%s' - CALLED!", type(args))
1255
1256     logger.debug("Invoking locking.acquire() ...")
1257     locking.acquire()
1258
1259     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1260         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1261     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1262         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1263     else:
1264         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1265
1266     rows = database.cursor.fetchall()
1267     logger.info("Checking %d domains ...", len(rows))
1268     for row in rows:
1269         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1270         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1271             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1272             continue
1273
1274         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1275         blocking = federation.fetch_blocks(row["domain"])
1276
1277         logger.debug("blocking()=%d", len(blocking))
1278         if len(blocking) == 0:
1279             if row["software"] == "pleroma":
1280                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1281                 blocking = pleroma.fetch_blocks(row["domain"])
1282             elif row["software"] == "mastodon":
1283                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1284                 blocking = mastodon.fetch_blocks(row["domain"])
1285             elif row["software"] == "lemmy":
1286                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1287                 blocking = lemmy.fetch_blocks(row["domain"])
1288             elif row["software"] == "friendica":
1289                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1290                 blocking = friendica.fetch_blocks(row["domain"])
1291             elif row["software"] == "misskey":
1292                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1293                 blocking = misskey.fetch_blocks(row["domain"])
1294             else:
1295                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1296
1297         # chaos.social isn't part of oliphant's "hidden" blocklists
1298         logger.debug("row[domain]='%s'", row["domain"])
1299         if row["domain"] != "chaos.social" and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1300             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1301             instances.set_last_blocked(row["domain"])
1302             instances.set_total_blocks(row["domain"], blocking)
1303
1304         obfuscated = 0
1305         blockdict = list()
1306
1307         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1308         for block in blocking:
1309             logger.debug("block[blocked]='%s'", block["blocked"])
1310             blocked = None
1311
1312             if block["blocked"] == "":
1313                 logger.debug("block[blocked] is empty - SKIPPED!")
1314                 continue
1315             elif block["blocked"].endswith(".arpa"):
1316                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1317                 continue
1318             elif block["blocked"].endswith(".tld"):
1319                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1320                 continue
1321             elif block["blocked"].endswith(".onion"):
1322                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1323                 continue
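            # Obfuscated entries mask the blocked domain with wildcards, e.g.
            # "*.example.com" or "exam??e.com"; utils.deobfuscate() below
            # tries to recover the real domain, helped by a hash digest where
            # the blocker publishes one (Mastodon exposes such a digest).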
1324             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1325                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1326                 obfuscated = obfuscated + 1
1327                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1328             elif not domain_helper.is_wanted(block["blocked"]):
1329                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1330                 continue
1331             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1332                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1333                 continue
1334
1335             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1336             if blocked is not None and blocked != block["blocked"]:
1337                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1338                 obfuscated = obfuscated - 1
1339
1340                 if blacklist.is_blacklisted(blocked):
1341                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1342                     continue
1343                 elif blacklist.is_blacklisted(row["domain"]):
1344                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1345                     continue
1346                 elif blocks.is_instance_blocked(row["domain"], blocked):
1347                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1348                     continue
1349
1350                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1351
1352                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1353                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1354                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1355                     blockdict.append({
1356                         "blocked": blocked,
1357                         "reason" : block["reason"],
1358                     })
1359
1360         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1361         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1362
1363         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1364         if instances.has_pending(row["domain"]):
1365             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1366             instances.update(row["domain"])
1367
1368         logger.debug("Invoking commit() ...")
1369         database.connection.commit()
1370
1371         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1372         if config.get("bot_enabled") and len(blockdict) > 0:
1373             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1374             network.send_bot_post(row["domain"], blockdict)
1375
1376     logger.debug("Success! - EXIT!")
1377     return 0
1378
1379 def fetch_fedilist(args: argparse.Namespace) -> int:
1380     logger.debug("args[]='%s' - CALLED!", type(args))
1381
1382     logger.debug("Invoking locking.acquire() ...")
1383     locking.acquire()
1384
1385     source_domain = "demo.fedilist.com"
1386     if sources.is_recent(source_domain):
1387     logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1388         return 1
1389     else:
1390         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1391         sources.update(source_domain)
1392
1393     url = f"http://{source_domain}/instance/csv?onion=not"
1394     if args.software is not None and args.software != "":
1395         logger.debug("args.software='%s'", args.software)
1396         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1397
1398     logger.info("Fetching url='%s' ...", url)
1399     response = reqto.get(
1400         url,
1401         headers=network.web_headers,
1402         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1403         allow_redirects=False
1404     )
1405
1406     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1407     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1408         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1409         return 1
1410
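    # Assumed CSV layout (only the "hostname" column is read below):
    #   hostname,software,...
    #   pleroma.example,pleroma,...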
1411     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1412
1413     logger.debug("reader[]='%s'", type(reader))
1414     if reader is None:
1415         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1416         return 2
1417
1418     rows = list(reader)
1419
1420     logger.info("Checking %d rows ...", len(rows))
1421     for row in rows:
1422         logger.debug("row[]='%s'", type(row))
1423         if "hostname" not in row:
1424             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1425             continue
1426
1427         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1428         domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1429         logger.debug("domain='%s' - AFTER!", domain)
1430
1431         if domain is None or domain == "":
1432             logger.debug("domain='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", domain, row["hostname"])
1433             continue
1434
1435         logger.debug("domain='%s' - BEFORE!", domain)
1436         domain = domain.encode("idna").decode("utf-8")
1437         logger.debug("domain='%s' - AFTER!", domain)
1438
1439         if not domain_helper.is_wanted(domain):
1440             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1441             continue
1442         elif (args.force is None or not args.force) and instances.is_registered(domain):
1443             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s' - SKIPPED!", domain, type(args.force))
1444             continue
1445         elif instances.is_recent(domain):
1446             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1447             continue
1448
1449         logger.info("Fetching instances from domain='%s' ...", domain)
1450         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1451
1452     logger.debug("Success! - EXIT!")
1453     return 0
1454
1455 def update_nodeinfo(args: argparse.Namespace) -> int:
1456     logger.debug("args[]='%s' - CALLED!", type(args))
1457
1458     logger.debug("Invoking locking.acquire() ...")
1459     locking.acquire()
1460
1461     if args.domain is not None and args.domain != "":
1462         logger.debug("Fetching args.domain='%s'", args.domain)
1463         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1464     elif args.software is not None and args.software != "":
1465         logger.info("Fetching domains for args.software='%s'", args.software)
1466         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1467     elif args.mode is not None and args.mode != "":
1468         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1469         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1470     elif args.no_software:
1471         logger.info("Fetching domains with no software type detected ...")
1472         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1473     elif args.no_auto:
1474         logger.info("Fetching domains with a detection mode other than AUTO_DISCOVERY set ...")
1475         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1476     elif args.no_detection:
1477         logger.info("Fetching domains with no detection mode being set ...")
1478         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1479     else:
1480         logger.info("Fetching all domains, least recently updated first ...")
1481         database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1482
1483     domains = database.cursor.fetchall()
1484
1485     logger.info("Checking %d domain(s) ...", len(domains))
1486     cnt = 0
1487     for row in domains:
1488         logger.debug("row[]='%s'", type(row))
1489         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1490             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1491             continue
1492
1493         try:
1494             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1495             software = federation.determine_software(row["domain"])
1496
1497             logger.debug("Determined software='%s'", software)
1498             if (software != row["software"] and software is not None) or args.force is True:
1499                 logger.debug("software='%s'", software)
1500                 if software is None:
1501                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1502                     instances.set_nodeinfo_url(row["domain"], None)
1503
1504                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1505                 instances.set_software(row["domain"], software)
1506
1507             if software is not None:
1508                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1509                 instances.set_success(row["domain"])
1510         except network.exceptions as exception:
1511             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1512             instances.set_last_error(row["domain"], exception)
1513
1514         instances.set_last_nodeinfo(row["domain"])
1515         instances.update(row["domain"])
1516         cnt = cnt + 1
1517
1518     logger.debug("Success! - EXIT!")
1519     return 0
1520
1521 def fetch_instances_social(args: argparse.Namespace) -> int:
1522     logger.debug("args[]='%s' - CALLED!", type(args))
1523
1524     logger.debug("Invoking locking.acquire() ...")
1525     locking.acquire()
1526
1527     source_domain = "instances.social"
1528
1529     if config.get("instances_social_api_key") == "":
1530         logger.error("API key not set. Please set it in your config.json file.")
1531         return 1
1532     elif sources.is_recent(source_domain):
1533         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1534         return 2
1535     else:
1536         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1537         sources.update(source_domain)
1538
1539     headers = {
1540         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1541     }
1542
1543     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1544     fetched = network.get_json_api(
1545         source_domain,
1546         "/api/1.0/instances/list?count=0&sort_by=name",
1547         headers,
1548         (config.get("connection_timeout"), config.get("read_timeout"))
1549     )
1550     logger.debug("fetched[]='%s'", type(fetched))
1551
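    # Assumed payload, inferred only from the key accesses below and wrapped
    # by network.get_json_api() under its "json" key:
    #   {"instances": [{"name": "mastodon.example", ...}, ...]}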
1552     if "error_message" in fetched:
1553         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1554         return 2
1555     elif "exception" in fetched:
1556         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1557         return 3
1558     elif "json" not in fetched:
1559         logger.warning("fetched has no element 'json' - EXIT!")
1560         return 4
1561     elif "instances" not in fetched["json"]:
1562         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1563         return 5
1564
1565     domains = list()
1566     rows = fetched["json"]["instances"]
1567
1568     logger.info("Checking %d row(s) ...", len(rows))
1569     for row in rows:
1570         logger.debug("row[]='%s'", type(row))
1571         domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1572         logger.debug("domain='%s' - AFTER!", domain)
1573
1574         if domain is None or domain == "":
1575             logger.debug("domain='%s' is empty after tidyup.domain() - SKIPPED!", domain)
1576             continue
1577
1578         logger.debug("domain='%s' - BEFORE!", domain)
1579         domain = domain.encode("idna").decode("utf-8")
1580         logger.debug("domain='%s' - AFTER!", domain)
1581
1582         if not domain_helper.is_wanted(domain):
1583             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1584             continue
1585         elif domain in domains:
1586             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1587             continue
1588         elif instances.is_registered(domain):
1589             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1590             continue
1591         elif instances.is_recent(domain):
1592             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1593             continue
1594
1595         logger.info("Fetching instances from domain='%s'", domain)
1596         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1597
1598     logger.debug("Success! - EXIT!")
1599     return 0
1600
1601 def fetch_relays(args: argparse.Namespace) -> int:
1602     logger.debug("args[]='%s' - CALLED!", type(args))
1603
1604     logger.debug("Invoking locking.acquire() ...")
1605     locking.acquire()
1606
1607     if args.domain is not None and args.domain != "":
1608         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1609     elif args.software is not None and args.software != "":
1610         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND software = ?", [args.software])
1611     else:
1612         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay')")
1613
1614     domains = list()
1615     rows = database.cursor.fetchall()
1616
1617     logger.info("Checking %d relays ...", len(rows))
1618     for row in rows:
1619         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1620         peers = list()
1621         if not args.force and instances.is_recent(row["domain"]):
1622             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1623             continue
1624
1625         try:
1626             if row["software"] == "pub-relay":
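                # pub-relay is assumed to publish its peer list in nodeinfo
                # metadata, e.g. (shape inferred from the checks below):
                #   {"metadata": {"peers": ["relay.example", ...], ...}}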
1627                 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1628                 raw = network.fetch_api_url(
1629                     row["nodeinfo_url"],
1630                     (config.get("connection_timeout"), config.get("read_timeout"))
1631                 )
1632
1633                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1634                 if "exception" in raw:
1635                     logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1636                     raise raw["exception"]
1637                 elif "error_message" in raw:
1638                     logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1639                     instances.set_last_error(row["domain"], raw)
1640                     instances.set_last_instance_fetch(row["domain"])
1641                     instances.update(row["domain"])
1642                     continue
1643                 elif "json" not in raw:
1644                     logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1645                     continue
1646                 elif "metadata" not in raw["json"]:
1647                     logger.warning("raw[json]()=%d does not contain key 'metadata' in response - SKIPPED!", len(raw["json"]))
1648                     continue
1649                 elif "peers" not in raw["json"]["metadata"]:
1650                     logger.warning("raw[json][metadata]()=%d does not contain key 'peers' in response - SKIPPED!", len(raw["json"]["metadata"]))
1651                     continue
1652             else:
1653                 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1654                 raw = utils.fetch_url(
1655                     f"https://{row['domain']}",
1656                     network.web_headers,
1657                     (config.get("connection_timeout"), config.get("read_timeout"))
1658                 ).text
1659                 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1660
1661                 doc = bs4.BeautifulSoup(raw, features="html.parser")
1662                 logger.debug("doc[]='%s'", type(doc))
1663
1664         except network.exceptions as exception:
1665             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1666             instances.set_last_error(row["domain"], exception)
1667             instances.set_last_instance_fetch(row["domain"])
1668             instances.update(row["domain"])
1669             continue
1670
1671         logger.debug("row[software]='%s'", row["software"])
1672         if row["software"] == "activityrelay":
1673             logger.debug("Checking row[domain]='%s' ...", row["domain"])
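            # Assumed activityrelay front-page markup (inferred from the
            # traversal below): a <p> whose first text node mentions
            # "registered instances", followed by peer domains as sibling
            # text nodes, e.g.:
            #   <p>12 registered instances:<br/>relay.example<br/>...</p>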
1674             tags = doc.findAll("p")
1675
1676             logger.debug("Checking %d paragraphs ...", len(tags))
1677             for tag in tags:
1678                 logger.debug("tag[]='%s'", type(tag))
1679                 if len(tag.contents) == 0:
1680                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1681                     continue
1682                 elif "registered instances" not in tag.contents[0]:
1683                     logger.debug("Skipping paragraph, text not found.")
1684                     continue
1685
1686                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1687                 for domain in tag.contents:
1688                     logger.debug("domain[%s]='%s'", type(domain), domain)
1689                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1690                         continue
1691
1692                     domain = str(domain)
1693                     logger.debug("domain='%s'", domain)
1694                     if not domain_helper.is_wanted(domain):
1695                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1696                         continue
1697
1698                     logger.debug("domain='%s' - BEFORE!", domain)
1699                     domain = tidyup.domain(domain) if domain not in [None, ""] else None
1700                     logger.debug("domain='%s' - AFTER!", domain)
1701
1702                     if domain is None or domain == "":
1703                         logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1704                         continue
1705                     elif domain not in peers:
1706                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1707                         peers.append(domain)
1708
1709                     if dict_helper.has_key(domains, "domain", domain):
1710                         logger.debug("domain='%s' already added", domain)
1711                         continue
1712
1713                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1714                     domains.append({
1715                         "domain": domain,
1716                         "origin": row["domain"],
1717                     })
1718         elif row["software"] in ["aoderelay", "selective-relay"]:
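            # Assumed markup (inferred from the selectors below): aoderelay
            # renders one <section class="instance"> per peer,
            # selective-relay one <li> under <div id="instances">; either
            # way the first <a> inside the tag carries the peer's URL.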
1719             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1720             if row["software"] == "aoderelay":
1721                 tags = doc.findAll("section", {"class": "instance"})
1722             else:
1723                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1724
1725             logger.debug("Checking %d tags ...", len(tags))
1726             for tag in tags:
1727                 logger.debug("tag[]='%s'", type(tag))
1728
1729                 link = tag.find("a")
1730                 logger.debug("link[%s]='%s'", type(link), link)
1731                 if not isinstance(link, bs4.element.Tag):
1732                     logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1733                     continue
1734
1735                 components = urlparse(link.get("href"))
1736                 logger.debug("components(%d)='%s'", len(components), components)
1737                 domain = components.netloc.lower().split(":")[0]
1738
1739                 logger.debug("domain='%s' - BEFORE!", domain)
1740                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1741                 logger.debug("domain='%s' - AFTER!", domain)
1742
1743                 if domain is None or domain == "":
1744                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1745                     continue
1746                 elif domain not in peers:
1747                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1748                     peers.append(domain)
1749
1750                 if dict_helper.has_key(domains, "domain", domain):
1751                     logger.debug("domain='%s' already added", domain)
1752                     continue
1753
1754                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1755                 domains.append({
1756                     "domain": domain,
1757                     "origin": row["domain"],
1758                 })
1759         elif row["software"] == "pub-relay":
1760             logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1761             for domain in raw["json"]["metadata"]["peers"]:
1762                 logger.debug("domain='%s' - BEFORE!", domain)
1763                 domain = tidyup.domain(domain) if domain not in [None, ""] else None
1764                 logger.debug("domain='%s' - AFTER!", domain)
1765
1766                 if domain is None or domain == "":
1767                     logger.debug("domain='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", domain, row["domain"])
1768                     continue
1769                 elif domain not in peers:
1770                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1771                     peers.append(domain)
1772
1773                 if dict_helper.has_key(domains, "domain", domain):
1774                     logger.debug("domain='%s' already added", domain)
1775                     continue
1776
1777                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1778                 domains.append({
1779                     "domain": domain,
1780                     "origin": row["domain"],
1781                 })
1782         else:
1783             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1784             continue
1785
1786         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1787         instances.set_last_instance_fetch(row["domain"])
1788
1789         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1790         instances.set_total_peers(row["domain"], peers)
1791
1792         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1793         instances.update(row["domain"])
1794
1795     logger.info("Checking %d domains ...", len(domains))
1796     for row in domains:
1797         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1798         if not domain_helper.is_wanted(row["domain"]):
1799             logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1800             continue
1801         elif instances.is_registered(row["domain"]):
1802             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1803             continue
1804
1805         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1806         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1807
1808     logger.debug("Success! - EXIT!")
1809     return 0
1810
1811 def convert_idna(args: argparse.Namespace) -> int:
1812     logger.debug("args[]='%s' - CALLED!", type(args))
1813
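    # IDNA ("punycode") conversion maps Unicode domains to their ASCII form,
    # e.g. "café.example".encode("idna").decode("utf-8") yields
    # "xn--caf-dma.example"; rows already containing "xn--" are excluded via
    # the NOT LIKE filters below.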
1814     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1815     rows = database.cursor.fetchall()
1816
1817     logger.debug("rows[]='%s'", type(rows))
1818     instances.translate_idnas(rows, "domain")
1819
1820     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1821     rows = database.cursor.fetchall()
1822
1823     logger.debug("rows[]='%s'", type(rows))
1824     instances.translate_idnas(rows, "origin")
1825
1826     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
1827     rows = database.cursor.fetchall()
1828
1829     logger.debug("rows[]='%s'", type(rows))
1830     blocks.translate_idnas(rows, "blocker")
1831
1832     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
1833     rows = database.cursor.fetchall()
1834
1835     logger.debug("rows[]='%s'", type(rows))
1836     blocks.translate_idnas(rows, "blocked")
1837
1838     logger.debug("Success! - EXIT!")
1839     return 0
1840
1841 def remove_invalid(args: argparse.Namespace) -> int:
1842     logger.debug("args[]='%s' - CALLED!", type(args))
1843
1844     logger.debug("Invoking locking.acquire() ...")
1845     locking.acquire()
1846
1847     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
1848     rows = database.cursor.fetchall()
1849
1850     logger.info("Checking %d domains ...", len(rows))
1851     for row in rows:
1852         logger.debug("row[domain]='%s'", row["domain"])
1853         if not validators.domain(row["domain"].split("/")[0]):
1854             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
1855             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
1856             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
1857
1858     logger.debug("Invoking commit() ...")
1859     database.connection.commit()
1860
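    # VACUUM is SQLite maintenance: it rebuilds the database file and
    # reclaims the space freed by the DELETE statements above.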
1861     logger.info("Vacuum cleaning database ...")
1862     database.cursor.execute("VACUUM")
1863
1864     logger.debug("Success! - EXIT!")
1865     return 0