# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import argparse
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
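    """
    Checks whether args.domain is a valid, non-blacklisted and not yet
    registered domain name. Returns 0 when the domain is unknown (and may
    be added), 100 when invalid, 101 when blacklisted and 102 when already
    registered.

    Illustrative invocation (the exact flag syntax depends on fba.py's
    argument parser): ./fba.py check_instance example.social
    """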
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
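    """
    Sanity-checks all stored nodeinfo URLs: counts rows whose absolute
    nodeinfo_url mentions neither the domain nor its punycode form.
    Relative URLs always match and are skipped. Always returns 0.
    """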
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
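    """
    Fetches the server list from pixelfed.org's API
    (/api/v1/servers/all.json) and registers every wanted, not yet
    registered and not recently crawled domain via
    federation.fetch_instances().
    """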
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default; network.source_headers does not need to be added manually here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch from pixelfed.org API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
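    """
    Queries the GraphQL endpoint at gql.api.bka.li for a list of known
    domains and fetches instances from each new, wanted domain.
    """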
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
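    """
    Fetches and records blocklists from registered instances. With no
    arguments, only instances whose last_blocked timestamp is older than
    the configured "recheck_block" interval are re-checked; args.domain
    re-checks a single domain, args.software a single software type and
    args.force everything. Entries obfuscated with '*' or '?' are
    deobfuscated where possible.
    """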
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.info("blocker='%s',software='%s' has %d block entries returned.", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", blocker, block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without a hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some servers obscure them with question marks; it is unclear whether that depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("blocked='%s' is not registered, adding: blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
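    """
    Crawls fediverse.observer: determines the list of software types from
    the site's navigation bar (or uses args.software) and registers all
    wanted domains found in the per-software table data.
    """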
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - AFTER!", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
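    """
    Scrapes wiki.todon.eu's domainblocks page and records the
    silenced/limited and suspended entries as blocks of todon.eu.
    """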
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
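    """
    Fetches chaos.social's federation.md from raw.githubusercontent.com,
    renders the Markdown and records the "silenced" and "blocked" tables
    as blocks of chaos.social.
    """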
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
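    """
    Parses the FBA-specific RSS feed given in args.feed and registers all
    new, wanted domains found in the item links.
    """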
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
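    """
    Parses an ATOM feed from an FBA bot account (default:
    https://ryona.agency/users/fba/feed.atom, overridable via args.feed)
    and registers all new, wanted domains linked in the entries.
    """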
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
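    """
    Fetches the peer list of args.domain and registers new instances.
    Unless args.single is set, afterwards re-fetches peers of known
    instances whose last_instance_fetch is older than the configured
    "recheck_instance" interval.
    """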
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
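    """
    Downloads oliphant's CSV blocklists from codeberg.org and imports
    domain/severity rows, including reject_media/reject_reports flags,
    as blocks of the respective blocker.
    """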
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code > 200 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.has(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
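    """
    Downloads plain-text blocklists from a static URL list (currently
    seirdy.one's bsl.txt) and processes each listed domain.
    """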
1129     logger.debug("args[]='%s' - CALLED!", type(args))
1130
1131     logger.debug("Invoking locking.acquire() ...")
1132     locking.acquire()
1133
1134     # Static URLs
1135     urls = ({
1136         "blocker": "seirdy.one",
1137         "url"    : "https://seirdy.one/pb/bsl.txt",
1138     },)
1139
1140     logger.info("Checking %d text file(s) ...", len(urls))
1141     for row in urls:
1142         logger.debug("Fetching row[url]='%s' ...", row["url"])
1143         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1144
1145         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1146         if response.ok and response.status_code == 200 and response.text != "":
1147             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1148             domains = response.text.split("\n")
1149
1150             logger.info("Processing %d domains ...", len(domains))
1151             for domain in domains:
1152                 logger.debug("domain='%s' - BEFORE!", domain)
1153                 domain = tidyup.domain(domain)
1154
1155                 logger.debug("domain='%s' - AFTER!", domain)
1156                 if domain == "":
1157                     logger.debug("domain is empty - SKIPPED!")
1158                     continue
1159                 elif not domain_helper.is_wanted(domain):
1160                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1161                     continue
1162                 elif instances.is_recent(domain):
1163                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1164                     continue
1165
1166                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1167                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1168
1169                 logger.debug("processed='%s'", processed)
1170                 if not processed:
1171                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1172                     continue
1173
1174     logger.debug("Success! - EXIT!")
1175     return 0
1176
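# A stdlib-only sketch of the per-line handling in fetch_txt() above, assuming
# the same one-domain-per-line format as seirdy.one's bsl.txt. The inline
# clean-up is a simplified stand-in for fba.helpers.tidyup.domain().
def _parse_text_blocklist_sketch(text: str) -> list:
    domains = list()
    for line in text.split("\n"):
        domain = line.strip().lower().rstrip(".")
        if domain == "" or domain.startswith("#"):
            # Skip blank lines and comments instead of processing them.
            continue
        domains.append(domain)
    return domains

# Example: _parse_text_blocklist_sketch("example.com\n\nbad.example\n")
# yields ["example.com", "bad.example"].
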
1177 def fetch_fedipact(args: argparse.Namespace) -> int:
1178     logger.debug("args[]='%s' - CALLED!", type(args))
1179
1180     logger.debug("Invoking locking.acquire() ...")
1181     locking.acquire()
1182
1183     source_domain = "fedipact.online"
1184     if sources.is_recent(source_domain):
1185         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1186         return 0
1187     else:
1188         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1189         sources.update(source_domain)
1190
1191     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1192     response = utils.fetch_url(
1193         f"https://{source_domain}",
1194         network.web_headers,
1195         (config.get("connection_timeout"), config.get("read_timeout"))
1196     )
1197
1198     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1199     if response.ok and response.status_code == 200 and response.text != "":
1200         logger.debug("Parsing %d Bytes ...", len(response.text))
1201
1202         doc = bs4.BeautifulSoup(response.text, "html.parser")
1203         logger.debug("doc[]='%s'", type(doc))
1204
1205         rows = doc.findAll("li")
1206         logger.info("Checking %d row(s) ...", len(rows))
1207         for row in rows:
1208             logger.debug("row[]='%s'", type(row))
1209             domain = tidyup.domain(row.contents[0])
1210
1211             logger.debug("domain='%s' - AFTER!", domain)
1212             if domain == "":
1213                 logger.debug("domain is empty - SKIPPED!")
1214                 continue
1215
1216             logger.debug("domain='%s' - BEFORE!", domain)
1217             domain = domain.encode("idna").decode("utf-8")
1218             logger.debug("domain='%s' - AFTER!", domain)
1219
1220             if not domain_helper.is_wanted(domain):
1221                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1222                 continue
1223             elif instances.is_registered(domain):
1224                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1225                 continue
1226             elif instances.is_recent(domain):
1227                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1228                 continue
1229
1230             logger.info("Fetching domain='%s' ...", domain)
1231             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1232
1233     logger.debug("Success! - EXIT!")
1234     return 0
1235
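# A compact sketch of the fedipact.online scraping step above, assuming each
# participating instance is the first text node of an <li> element. It reuses
# this module's bs4 import; the HTML in the example is illustrative only.
def _scrape_li_domains_sketch(html: str) -> list:
    doc = bs4.BeautifulSoup(html, "html.parser")
    found = list()
    for row in doc.find_all("li"):
        if len(row.contents) == 0:
            continue
        domain = str(row.contents[0]).strip().lower()
        if domain == "":
            continue
        # Normalize internationalized names to punycode, as done above.
        found.append(domain.encode("idna").decode("utf-8"))
    return found

# Example: _scrape_li_domains_sketch("<ul><li>beach.city</li></ul>")
# returns ["beach.city"].
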
1236 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1237     logger.debug("args[]='%s' - CALLED!", type(args))
1238
1239     logger.debug("Invoking locking.acquire() ...")
1240     locking.acquire()
1241
1242     source_domain = "instances.joinmobilizon.org"
1243     if sources.is_recent(source_domain):
1244         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1245         return 0
1246     else:
1247         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1248         sources.update(source_domain)
1249
1250     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1251     raw = utils.fetch_url(
1252         f"https://{source_domain}/api/v1/instances",
1253         network.web_headers,
1254         (config.get("connection_timeout"), config.get("read_timeout"))
1255     ).text
1256     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1257
1258     parsed = json.loads(raw)
1259     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1260
1261     if "data" not in parsed:
1262         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1263         return 1
1264
1265     logger.info("Checking %d instances ...", len(parsed["data"]))
1266     for row in parsed["data"]:
1267         logger.debug("row[]='%s'", type(row))
1268         if "host" not in row:
1269             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1270             continue
1271         elif not domain_helper.is_wanted(row["host"]):
1272             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1273             continue
1274         elif instances.is_registered(row["host"]):
1275             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1276             continue
1277
1278         logger.info("Fetching row[host]='%s' ...", row["host"])
1279         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1280
1281     logger.debug("Success! - EXIT!")
1282     return 0
1283
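# A sketch of the guarded JSON iteration above, runnable against a canned
# payload: the joinmobilizon API wraps its list in a top-level "data" key,
# and rows without "host" are skipped rather than failing the whole run.
def _extract_hosts_sketch(raw: str) -> list:
    parsed = json.loads(raw)
    if "data" not in parsed:
        # Same bail-out as above, minus the logging.
        return list()
    return [row["host"] for row in parsed["data"] if "host" in row]

# Example: _extract_hosts_sketch('{"data": [{"host": "demo.mobilizon.org"}, {}]}')
# returns ["demo.mobilizon.org"].
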
1284 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1285     logger.debug("args[]='%s' - CALLED!", type(args))
1286
1287     logger.debug("Invoking locking.acquire() ...")
1288     locking.acquire()
1289
1290     source_domain = "instanceapp.misskey.page"
1291     if sources.is_recent(source_domain):
1292         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1293         return 0
1294     else:
1295         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1296         sources.update(source_domain)
1297
1298     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1299     raw = utils.fetch_url(
1300         f"https://{source_domain}/instances.json",
1301         network.web_headers,
1302         (config.get("connection_timeout"), config.get("read_timeout"))
1303     ).text
1304     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1305
1306     parsed = json.loads(raw)
1307     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1308
1309     if "instancesInfos" not in parsed:
1310         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1311         return 1
1312
1313     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1314     for row in parsed["instancesInfos"]:
1315         logger.debug("row[%s]='%s'", type(row), row)
1316         if "url" not in row:
1317             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1318             continue
1319         elif not domain_helper.is_wanted(row["url"]):
1320             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1321             continue
1322         elif instances.is_registered(row["url"]):
1323             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1324             continue
1325
1326         logger.info("Fetching row[url]='%s' ...", row["url"])
1327         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1328
1329     logger.debug("Success! - EXIT!")
1330     return 0
1331
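# Every fetch_* command above opens with the same throttle guard: bail out if
# the source was queried recently, otherwise stamp it and continue. A minimal
# in-memory sketch of that idea, with a plain dict standing in for the
# sources table and an assumed 24-hour window.
_LAST_FETCH_SKETCH = dict()

def _is_recent_sketch(source_domain: str, max_age: float = 86400.0) -> bool:
    last = _LAST_FETCH_SKETCH.get(source_domain)
    return last is not None and (time.time() - last) < max_age

def _mark_fetched_sketch(source_domain: str) -> None:
    _LAST_FETCH_SKETCH[source_domain] = time.time()

# Usage mirrors the guard above:
#     if _is_recent_sketch("instanceapp.misskey.page"):
#         return 0
#     _mark_fetched_sketch("instanceapp.misskey.page")
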
1332 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1333     logger.debug("args[]='%s' - CALLED!", type(args))
1334
1335     logger.debug("Invoking locking.acquire() ...")
1336     locking.acquire()
1337
1338     source_domain = "joinfediverse.wiki"
1339     if sources.is_recent(source_domain):
1340         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1341         return 0
1342     else:
1343         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1344         sources.update(source_domain)
1345
1346     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1347     raw = utils.fetch_url(
1348         f"https://{source_domain}/FediBlock",
1349         network.web_headers,
1350         (config.get("connection_timeout"), config.get("read_timeout"))
1351     ).text
1352     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1353
1354     doc = bs4.BeautifulSoup(raw, "html.parser")
1355     logger.debug("doc[]='%s'", type(doc))
1356
1357     tables = doc.findAll("table", {"class": "wikitable"})
1358
1359     logger.info("Analyzing %d table(s) ...", len(tables))
1360     blocklist = list()
1361     for table in tables:
1362         logger.debug("table[]='%s'", type(table))
1363
1364         rows = table.findAll("tr")
1365         logger.info("Checking %d row(s) ...", len(rows))
1366         block_headers = dict()
1367         for row in rows:
1368             logger.debug("row[%s]='%s'", type(row), row)
1369
1370             headers = row.findAll("th")
1371             logger.debug("Found headers()=%d header(s)", len(headers))
1372             if len(headers) > 1:
1373                 block_headers = dict()
1374                 cnt = 0
1375                 for header in headers:
1376                     cnt = cnt + 1
1377                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1378                     text = header.contents[0]
1379
1380                     logger.debug("text[]='%s'", type(text))
1381                     if not isinstance(text, str):
1382                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1383                         continue
1384                     elif validators.domain(text.strip()):
1385                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1386                         continue
1387
1388                     text = tidyup.domain(text.strip())
1389                     logger.debug("text='%s' - AFTER!", text)
1390                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1391                         logger.debug("Found header: '%s'=%d", text, cnt)
1392                         block_headers[cnt] = text
1393
1394             elif len(block_headers) == 0:
1395                 logger.debug("row is not scrapable - SKIPPED!")
1396                 continue
1397             elif len(block_headers) > 0:
1398                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1399                 cnt = 0
1400                 block = dict()
1401
1402                 for element in row.find_all(["th", "td"]):
1403                     cnt = cnt + 1
1404                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1405                     if cnt in block_headers:
1406                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1407
1408                         text = element.text.strip()
1409                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1410
1411                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1412                         if key == "blocked":
1413                             block[key] = text
1414                         elif key == "block reason(s)":
1415                             block[key] = tidyup.reason(text)
1416                         elif key == "subdomain(s)":
1417                             block[key] = list()
1418                             if text != "":
1419                                 block[key] = text.split("/")
1420                         else:
1421                             logger.debug("key='%s'", key)
1422                             block[key] = text
1423
1424                 logger.debug("block()=%d ...", len(block))
1425                 if len(block) > 0:
1426                     logger.debug("Appending block()=%d ...", len(block))
1427                     blocklist.append(block)
1428
1429     logger.debug("blocklist()=%d", len(blocklist))
1430
1431     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1432     domains = database.cursor.fetchall()
1433
1434     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1435     blocking = list()
1436     for block in blocklist:
1437         logger.debug("block='%s'", block)
1438         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1439             origin = block["blocked"]
1440             logger.debug("origin='%s'", origin)
1441             for subdomain in block["subdomain(s)"]:
1442                 entry = dict(block, blocked=subdomain + "." + origin)
1443                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1444                 blocking.append(entry)
1445         else:
1446             blocking.append(block)
1447
1448     logger.debug("blocking()=%d", len(blocking))
1449     for block in blocking:
1450         logger.debug("block[]='%s'", type(block))
1451         if "blocked" not in block:
1452             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1453
1454         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1455         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1456
1457         if block["blocked"] == "":
1458             logger.debug("block[blocked] is empty - SKIPPED!")
1459             continue
1460         elif not domain_helper.is_wanted(block["blocked"]):
1461             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1462             continue
1463         elif instances.is_recent(block["blocked"]):
1464             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1465             continue
1466
1467         logger.debug("Processing blocked='%s' ...", block["blocked"])
1468         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1469
1470     blockdict = list()
1471     for blocker in domains:
1472         blocker = blocker[0]
1473         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1474         instances.set_last_blocked(blocker)
1475
1476         for block in blocking:
1477             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1478             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1479
1480             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1481             if block["blocked"] == "":
1482                 logger.debug("block[blocked] is empty - SKIPPED!")
1483                 continue
1484             elif not domain_helper.is_wanted(block["blocked"]):
1485                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1486                 continue
1487
1488             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1489             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1490                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1491                 blockdict.append({
1492                     "blocked": block["blocked"],
1493                     "reason" : block["reason"],
1494                 })
1495
1496         if instances.has_pending(blocker):
1497             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1498             instances.update(blocker)
1499
1500         logger.debug("Invoking commit() ...")
1501         database.connection.commit()
1502
1503         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1504         if config.get("bot_enabled") and len(blockdict) > 0:
1505             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1506             network.send_bot_post(blocker, blockdict)
1507
1508     logger.debug("Success! - EXIT!")
1509     return 0
1510
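# A condensed sketch of the wikitable scraping in fetch_joinfediverse() above:
# remember which column holds which field from the <th> row, then read data
# rows by those positions. Reuses this module's bs4 import; the column names
# are whatever the header row contains.
def _parse_wikitable_sketch(html: str) -> list:
    doc = bs4.BeautifulSoup(html, "html.parser")
    headers = dict()
    rows = list()
    for tr in doc.find_all("tr"):
        ths = tr.find_all("th")
        if len(ths) > 1:
            # Header row: map column index to lower-cased header text.
            headers = {idx: th.get_text(strip=True).lower() for idx, th in enumerate(ths)}
            continue
        if len(headers) == 0:
            # Nothing scrapable before the first header row.
            continue
        row = dict()
        for idx, cell in enumerate(tr.find_all(["th", "td"])):
            if idx in headers:
                row[headers[idx]] = cell.get_text(strip=True)
        if len(row) > 0:
            rows.append(row)
    return rows
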
1511 def recheck_obfuscation(args: argparse.Namespace) -> int:
1512     logger.debug("args[]='%s' - CALLED!", type(args))
1513
1514     logger.debug("Invoking locking.acquire() ...")
1515     locking.acquire()
1516
1517     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1518         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1519     elif isinstance(args.software, str) and args.software != "":
1520         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1521     else:
1522         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1523
1524     rows = database.cursor.fetchall()
1525     logger.info("Checking %d domains ...", len(rows))
1526     for row in rows:
1527         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1528         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1529             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1530             continue
1531
1532         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1533         blocking = federation.fetch_blocks(row["domain"])
1534
1535         logger.debug("blocking()=%d", len(blocking))
1536         if len(blocking) == 0:
1537             if row["software"] == "pleroma":
1538                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1539                 blocking = pleroma.fetch_blocks(row["domain"])
1540             elif row["software"] == "mastodon":
1541                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1542                 blocking = mastodon.fetch_blocks(row["domain"])
1543             elif row["software"] == "lemmy":
1544                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1545                 blocking = lemmy.fetch_blocks(row["domain"])
1546             elif row["software"] == "friendica":
1547                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1548                 blocking = friendica.fetch_blocks(row["domain"])
1549             elif row["software"] == "misskey":
1550                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1551                 blocking = misskey.fetch_blocks(row["domain"])
1552             else:
1553                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1554
1555         # chaos.social isn't part of oliphant's "hidden" blocklists
1556         logger.debug("row[domain]='%s'", row["domain"])
1557         if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]):
1558             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1559             instances.set_last_blocked(row["domain"])
1560             instances.set_total_blocks(row["domain"], blocking)
1561
1562         obfuscated = 0
1563         blockdict = list()
1564
1565         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1566         for block in blocking:
1567             logger.debug("block[blocked]='%s'", block["blocked"])
1568             blocked = None
1569
1570             if block["blocked"] == "":
1571                 logger.debug("block[blocked] is empty - SKIPPED!")
1572                 continue
1573             elif block["blocked"].endswith(".arpa"):
1574                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1575                 continue
1576             elif block["blocked"].endswith(".tld"):
1577                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1578                 continue
1579             elif block["blocked"].endswith(".onion"):
1580                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1581                 continue
1582             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1583                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1584                 obfuscated = obfuscated + 1
1585                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1586             elif not domain_helper.is_wanted(block["blocked"]):
1587                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1588                 continue
1589             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1590                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1591                 continue
1592
1593             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1594             if blocked is not None and blocked != block["blocked"]:
1595                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1596                 obfuscated = obfuscated - 1
1597
1598                 if blocks.is_instance_blocked(row["domain"], blocked):
1599                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1600                     continue
1601                 elif blacklist.is_blacklisted(blocked):
1602                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1603                     continue
1604
1605                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1606
1607                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1608                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1609                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1610                     blockdict.append({
1611                         "blocked": blocked,
1612                         "reason" : block["reason"],
1613                     })
1614
1615         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1616         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1617
1618         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1619         if obfuscated == 0 and len(blocking) > 0:
1620             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1621             instances.set_has_obfuscation(row["domain"], False)
1622
1623         if instances.has_pending(row["domain"]):
1624             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1625             instances.update(row["domain"])
1626
1627         logger.debug("Invoking commit() ...")
1628         database.connection.commit()
1629
1630         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1631         if config.get("bot_enabled") and len(blockdict) > 0:
1632             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1633             network.send_bot_post(row["domain"], blockdict)
1634
1635     logger.debug("Success! - EXIT!")
1636     return 0
1637
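# A sketch of what "deobfuscation" means above: blockers may publish entries
# like "exam*le.social" instead of the real domain, and utils.deobfuscate()
# tries to resolve them against known instances. This stand-in matches a
# wildcard pattern against a candidate list using only stdlib fnmatch.
from fnmatch import fnmatch

def _deobfuscate_sketch(pattern: str, known_domains: list):
    # "*" and "?" carry the same meaning as in the obfuscated entries above.
    matches = [candidate for candidate in known_domains if fnmatch(candidate, pattern)]
    # Only an unambiguous match is usable; otherwise keep it obfuscated (None).
    return matches[0] if len(matches) == 1 else None

# Example: _deobfuscate_sketch("exam*le.social", ["example.social", "beach.city"])
# returns "example.social".
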
1638 def fetch_fedilist(args: argparse.Namespace) -> int:
1639     logger.debug("args[]='%s' - CALLED!", type(args))
1640
1641     logger.debug("Invoking locking.acquire() ...")
1642     locking.acquire()
1643
1644     source_domain = "demo.fedilist.com"
1645     if sources.is_recent(source_domain):
1646         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1647         return 0
1648     else:
1649         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1650         sources.update(source_domain)
1651
1652     url = f"http://{source_domain}/instance/csv?onion=not"
1653     if args.software is not None and args.software != "":
1654         logger.debug("args.software='%s'", args.software)
1655         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1656
1657     logger.info("Fetching url='%s' ...", url)
1658     response = reqto.get(
1659         url,
1660         headers=network.web_headers,
1661         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1662         allow_redirects=False
1663     )
1664
1665     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1666     if not response.ok or response.status_code > 200 or len(response.content) == 0:
1667         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1668         return 1
1669
1670     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1671
1672     logger.debug("reader[]='%s'", type(reader))
1673     if reader is None:
1674         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1675         return 2
1676
1677     rows = list(reader)
1678
1679     logger.info("Checking %d rows ...", len(rows))
1680     for row in rows:
1681         logger.debug("row[]='%s'", type(row))
1682         if "hostname" not in row:
1683             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1684             continue
1685
1686         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1687         domain = tidyup.domain(row["hostname"])
1688         logger.debug("domain='%s' - AFTER!", domain)
1689
1690         if domain == "":
1691             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1692             continue
1693
1694         logger.debug("domain='%s' - BEFORE!", domain)
1695         domain = domain.encode("idna").decode("utf-8")
1696         logger.debug("domain='%s' - AFTER!", domain)
1697
1698         if not domain_helper.is_wanted(domain):
1699             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1700             continue
1701         elif (args.force is None or not args.force) and instances.is_registered(domain):
1702             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1703             continue
1704         elif instances.is_recent(domain):
1705             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1706             continue
1707
1708         logger.info("Fetching instances from domain='%s' ...", domain)
1709         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1710
1711     logger.debug("Success! - EXIT!")
1712     return 0
1713
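# A stdlib sketch of the CSV handling in fetch_fedilist() above. Note that
# csv.DictReader never returns None, so the guard above is purely defensive;
# the real failure mode is a row without a "hostname" column, handled per row.
def _parse_fedilist_csv_sketch(payload: bytes) -> list:
    reader = csv.DictReader(payload.decode("utf-8").splitlines(), dialect="unix")
    return [
        row["hostname"].strip().lower()
        for row in reader
        if "hostname" in row and row["hostname"] != ""
    ]

# Example:
# _parse_fedilist_csv_sketch(b'"hostname","software"\n"beach.city","mastodon"\n')
# returns ["beach.city"].
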
1714 def update_nodeinfo(args: argparse.Namespace) -> int:
1715     logger.debug("args[]='%s' - CALLED!", type(args))
1716
1717     logger.debug("Invoking locking.acquire() ...")
1718     locking.acquire()
1719
1720     if args.domain is not None and args.domain != "":
1721         logger.debug("Fetching args.domain='%s'", args.domain)
1722         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1723     elif args.software is not None and args.software != "":
1724         logger.info("Fetching domains for args.software='%s'", args.software)
1725         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1726     elif args.mode is not None and args.mode != "":
1727         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1728         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1729     elif args.no_software:
1730         logger.info("Fetching domains with no software type detected ...")
1731         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
1732     else:
1733         logger.info("Fetching domains for recently updated ...")
1734         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1735
1736     domains = database.cursor.fetchall()
1737
1738     logger.info("Checking %d domain(s) ...", len(domains))
1739     cnt = 0
1740     for row in domains:
1741         logger.debug("row[]='%s'", type(row))
1742         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1743             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1744             continue
1745
1746         try:
1747             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1748             software = federation.determine_software(row["domain"])
1749
1750             logger.debug("Determined software='%s'", software)
1751             if (software != row["software"] and software is not None) or args.force is True:
1752                 logger.debug("software='%s'", software)
1753                 if software is None:
1754                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1755                     instances.set_nodeinfo_url(row["domain"], None)
1756
1757                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1758                 instances.set_software(row["domain"], software)
1759
1760             if software is not None:
1761                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1762                 instances.set_success(row["domain"])
1763         except network.exceptions as exception:
1764             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1765             instances.set_last_error(row["domain"], exception)
1766
1767         instances.set_last_nodeinfo(row["domain"])
1768         instances.update(row["domain"])
1769         cnt = cnt + 1
1770
1771     logger.debug("Success! - EXIT!")
1772     return 0
1773
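# The SELECT statements above all share one staleness idiom: a row is due for
# rechecking when last_nodeinfo is NULL or older than now minus the configured
# recheck_nodeinfo interval. A self-contained sqlite3 sketch of that cutoff;
# the table and values are illustrative, not the real schema.
import sqlite3

def _stale_domains_sketch(recheck_seconds: float) -> list:
    connection = sqlite3.connect(":memory:")
    connection.execute("CREATE TABLE instances (domain TEXT, last_nodeinfo REAL)")
    connection.executemany("INSERT INTO instances VALUES (?, ?)", [
        ("fresh.example", time.time()),
        ("stale.example", time.time() - 10 * 86400),
        ("never.example", None),
    ])
    cutoff = time.time() - recheck_seconds
    rows = connection.execute(
        "SELECT domain FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL",
        [cutoff]
    ).fetchall()
    connection.close()
    return [row[0] for row in rows]

# _stale_domains_sketch(86400.0) returns "stale.example" and "never.example"
# (order unspecified without an ORDER BY clause).
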
1774 def fetch_instances_social(args: argparse.Namespace) -> int:
1775     logger.debug("args[]='%s' - CALLED!", type(args))
1776
1777     logger.debug("Invoking locking.acquire() ...")
1778     locking.acquire()
1779
1780     source_domain = "instances.social"
1781
1782     if config.get("instances_social_api_key") == "":
1783         logger.error("API key not set. Please set it in your config.json file.")
1784         return 1
1785     elif sources.is_recent(source_domain):
1786         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1787         return 0
1788     else:
1789         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1790         sources.update(source_domain)
1791
1792     headers = {
1793         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1794     }
1795
1796     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1797     fetched = network.get_json_api(
1798         source_domain,
1799         "/api/1.0/instances/list?count=0&sort_by=name",
1800         headers,
1801         (config.get("connection_timeout"), config.get("read_timeout"))
1802     )
1803     logger.debug("fetched[]='%s'", type(fetched))
1804
1805     if "error_message" in fetched:
1806         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1807         return 2
1808     elif "exception" in fetched:
1809         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1810         return 3
1811     elif "json" not in fetched:
1812         logger.warning("fetched has no element 'json' - EXIT!")
1813         return 4
1814     elif "instances" not in fetched["json"]:
1815         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1816         return 5
1817
1818     domains = list()
1819     rows = fetched["json"]["instances"]
1820
1821     logger.info("Checking %d row(s) ...", len(rows))
1822     for row in rows:
1823         logger.debug("row[]='%s'", type(row))
1824         domain = tidyup.domain(row["name"])
1825         logger.debug("domain='%s' - AFTER!", domain)
1826
1827         if domain == "":
1828             logger.debug("domain is empty - SKIPPED!")
1829             continue
1830
1831         logger.debug("domain='%s' - BEFORE!", domain)
1832         domain = domain.encode("idna").decode("utf-8")
1833         logger.debug("domain='%s' - AFTER!", domain)
1834
1835         if not domain_helper.is_wanted(domain):
1836             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1837             continue
1838         elif domain in domains:
1839             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1840             continue
1841         elif instances.is_registered(domain):
1842             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1843             continue
1844         elif instances.is_recent(domain):
1845             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1846             continue
1847
1848         logger.info("Fetching instances from domain='%s'", domain)
1849         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1850
1851     logger.debug("Success! - EXIT!")
1852     return 0
1853
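# A sketch of the authenticated instances.social call above, using plain
# requests rather than reqto. The timeout values are placeholder assumptions;
# never hard-code a real API key.
import requests

def _fetch_instances_social_sketch(api_key: str) -> list:
    response = requests.get(
        "https://instances.social/api/1.0/instances/list?count=0&sort_by=name",
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=(5.0, 10.0)
    )
    response.raise_for_status()
    data = response.json()
    # Collapses the guard chain above: a missing "instances" key yields [].
    return data.get("instances", [])
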
1854 def fetch_relays(args: argparse.Namespace) -> int:
1855     logger.debug("args[]='%s' - CALLED!", type(args))
1856
1857     logger.debug("Invoking locking.acquire() ...")
1858     locking.acquire()
1859
1860     if args.domain is not None and args.domain != "":
1861         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1862     else:
1863         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1864
1865     domains = list()
1866     rows = database.cursor.fetchall()
1867
1868     logger.info("Checking %d relays ...", len(rows))
1869     for row in rows:
1870         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1871         peers = list()
1872         if not args.force and instances.is_recent(row["domain"]):
1873             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1874             continue
1875
1876         try:
1877             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1878             raw = utils.fetch_url(
1879                 f"https://{row['domain']}",
1880                 network.web_headers,
1881                 (config.get("connection_timeout"), config.get("read_timeout"))
1882             ).text
1883             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1884         except network.exceptions as exception:
1885             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1886             instances.set_last_error(row["domain"], exception)
1887             instances.set_last_instance_fetch(row["domain"])
1888             instances.update(row["domain"])
1889             continue
1890
1891         doc = bs4.BeautifulSoup(raw, features="html.parser")
1892         logger.debug("doc[]='%s'", type(doc))
1893
1894         logger.debug("row[software]='%s'", row["software"])
1895         if row["software"] == "activityrelay":
1896             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1897             tags = doc.findAll("p")
1898
1899             logger.debug("Checking %d paragraphs ...", len(tags))
1900             for tag in tags:
1901                 logger.debug("tag[]='%s'", type(tag))
1902                 if len(tag.contents) == 0:
1903                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1904                     continue
1905                 elif "registered instances" not in tag.contents[0]:
1906                     logger.debug("Skipping paragraph, text not found.")
1907                     continue
1908
1909                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1910                 for domain in tag.contents:
1911                     logger.debug("domain[%s]='%s'", type(domain), domain)
1912                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1913                         continue
1914
1915                     domain = str(domain)
1916                     logger.debug("domain='%s'", domain)
1917                     if not domain_helper.is_wanted(domain):
1918                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1919                         continue
1920
1921                     logger.debug("domain='%s' - BEFORE!", domain)
1922                     domain = tidyup.domain(domain)
1923                     logger.debug("domain='%s' - AFTER!", domain)
1924
1925                     if domain == "":
1926                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1927                         continue
1928                     elif domain not in peers:
1929                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1930                         peers.append(domain)
1931
1932                     if dict_helper.has_key(domains, "domain", domain):
1933                         logger.debug("domain='%s' already added", domain)
1934                         continue
1935
1936                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1937                     domains.append({
1938                         "domain": domain,
1939                         "origin": row["domain"],
1940                     })
1941         elif row["software"] in ["aoderelay", "selective-relay"]:
1942             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1943             if row["software"] == "aoderelay":
1944                 tags = doc.findAll("section", {"class": "instance"})
1945             else:
1946                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1947
1948             logger.debug("Checking %d tags ...", len(tags))
1949             for tag in tags:
1950                 logger.debug("tag[]='%s'", type(tag))
1951
1952                 link = tag.find("a")
1953                 logger.debug("link[%s]='%s'", type(link), link)
1954                 if link is None:
1955                     logger.warning("tag='%s' has no a-tag ...", tag)
1956                     continue
1957
1958                 components = urlparse(link["href"])
1959                 domain = components.netloc.lower()
1960
1961                 if not domain_helper.is_wanted(domain):
1962                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1963                     continue
1964
1965                 logger.debug("domain='%s' - BEFORE!", domain)
1966                 domain = tidyup.domain(domain)
1967                 logger.debug("domain='%s' - AFTER!", domain)
1968
1969                 if domain == "":
1970                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1971                     continue
1972                 elif domain not in peers:
1973                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1974                     peers.append(domain)
1975
1976                 if dict_helper.has_key(domains, "domain", domain):
1977                     logger.debug("domain='%s' already added", domain)
1978                     continue
1979
1980                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1981                 domains.append({
1982                     "domain": domain,
1983                     "origin": row["domain"],
1984                 })
1985         else:
1986             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1987
1988         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1989         instances.set_last_instance_fetch(row["domain"])
1990
1991         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1992         instances.set_total_peers(row["domain"], peers)
1993
1994         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1995         instances.update(row["domain"])
1996
1997     logger.info("Checking %d domains ...", len(domains))
1998     for row in domains:
1999         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2000         if instances.is_registered(row["domain"]):
2001             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2002             continue
2003
2004         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2005         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2006
2007     logger.debug("Success! - EXIT!")
2008     return 0
2009
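# A sketch of the selective-relay branch above: peer domains are pulled from
# anchor tags inside a container element. Reuses this module's bs4 and
# urlparse imports; the HTML in the example is illustrative only.
def _relay_peers_sketch(html: str) -> list:
    doc = bs4.BeautifulSoup(html, "html.parser")
    container = doc.find("div", {"id": "instances"})
    if container is None:
        return list()
    peers = list()
    for tag in container.find_all("li"):
        link = tag.find("a")
        if link is None or not link.has_attr("href"):
            continue
        domain = urlparse(link["href"]).netloc.lower()
        if domain != "" and domain not in peers:
            peers.append(domain)
    return peers

# Example:
# _relay_peers_sketch('<div id="instances"><ul><li><a href="https://beach.city/">beach.city</a></li></ul></div>')
# returns ["beach.city"].
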
2010 def convert_idna(args: argparse.Namespace) -> int:
2011     logger.debug("args[]='%s' - CALLED!", type(args))
2012
2013     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2014     rows = database.cursor.fetchall()
2015
2016     logger.debug("rows[]='%s'", type(rows))
2017     instances.translate_idnas(rows, "domain")
2018
2019     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2020     rows = database.cursor.fetchall()
2021
2022     logger.debug("rows[]='%s'", type(rows))
2023     instances.translate_idnas(rows, "origin")
2024
2025     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2026     rows = database.cursor.fetchall()
2027
2028     logger.debug("rows[]='%s'", type(rows))
2029     blocks.translate_idnas(rows, "blocker")
2030
2031     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2032     rows = database.cursor.fetchall()
2033
2034     logger.debug("rows[]='%s'", type(rows))
2035     blocks.translate_idnas(rows, "blocked")
2036
2037     logger.debug("Success! - EXIT!")
2038     return 0
2039
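# The conversion above rests on Python's built-in "idna" codec (IDNA 2003
# rules), the same mechanism behind the inline encode/decode calls elsewhere
# in this module. A worked example with an illustrative name:
#
#     "bücher.example".encode("idna").decode("utf-8")
#     # -> "xn--bcher-kva.example"
#
# The SELECTs above exclude rows already containing "xn--", so the
# translate_idnas() helpers only see names that still need converting.
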
2040 def remove_invalid(args: argparse.Namespace) -> int:
2041     logger.debug("args[]='%s' - CALLED!", type(args))
2042
2043     logger.debug("Invoking locking.acquire() ...")
2044     locking.acquire()
2045
2046     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2047     rows = database.cursor.fetchall()
2048
2049     logger.info("Checking %d domains ...", len(rows))
2050     for row in rows:
2051         logger.debug("row[domain]='%s'", row["domain"])
2052         if not validators.domain(row["domain"].split("/")[0]):
2053             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2054             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2055             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2056
2057     logger.debug("Invoking commit() ...")
2058     database.connection.commit()
2059
2060     logger.info("Vacuum cleaning database ...")
2061     database.cursor.execute("VACUUM")
2062
2063     logger.debug("Success! - EXIT!")
2064     return 0
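
# A sketch of the validity test driving remove_invalid() above:
# validators.domain() is truthy for well-formed names and falsy otherwise,
# and split("/")[0] strips any path fragment that leaked into the column.
def _is_valid_instance_sketch(domain: str) -> bool:
    return bool(validators.domain(domain.split("/")[0]))

# _is_valid_instance_sketch("beach.city/some/path") -> True
# _is_valid_instance_sketch("not a domain")         -> False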