# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

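"""Command implementations for the fba command-line interface.

Each function below implements one sub-command: it receives the parsed
command-line arguments as an argparse.Namespace and returns a numeric
exit code (0 on success, non-zero otherwise)."""
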
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
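    """Checks whether args.domain could be added: it has to be a valid
    domain name and must be neither blacklisted nor already registered.
    Returns 0 when the domain is unknown, 100-102 otherwise."""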
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
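    """Checks all stored nodeinfo URLs: a URL must either be relative or
    contain the instance's domain (raw or punycode form). Mismatches are
    logged and counted; always returns 0."""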
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
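    """Fetches the public server list from the pixelfed.org API and fetches
    instance data for every new, wanted domain. Returns 0 on success,
    100-103 on errors."""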
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default; there is no need to add network.source_headers here manually
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
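    """Fetches a domain list from the GraphQL API at gql.api.bka.li and
    fetches instance data for every new, wanted domain. Returns 0 on
    success, 100-102 on errors."""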
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.api.bka.li' returned error_message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
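    """Fetches block lists from all registered instances with supported
    software, or only from args.domain/args.software when given, and
    records each block. New 'reject' blocks are reported through the bot
    when it is enabled."""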
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not domain_helper.is_wanted(blocker):
            logger.debug("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("blocker='%s'", blocker)
        if blocker != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some Friendica servers also obscure domains without a hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some servers obscure them with question marks; unclear whether this depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
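    """Scrapes fediverse.observer for instance domains, either for all
    software types found in the site's navigation bar or only for
    args.software, and fetches instance data for new, wanted domains."""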
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - AFTER!", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
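    """Fetches the domain block wiki page of todon.eu and records the
    silenced/limited and suspended entries as blocks for todon.eu."""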
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
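    """Fetches chaos.social's federation.md from raw.githubusercontent.com,
    renders the Markdown and records the 'silenced' and 'blocked' instance
    tables as blocks for chaos.social."""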
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
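    """Fetches the FBA-specific RSS feed given as args.feed and fetches
    instance data for every new, wanted domain found in the item links."""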
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
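    """Fetches the Atom feed of the FBA bot account on ryona.agency and
    fetches instance data for every new, wanted domain linked from its
    entries."""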
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
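    """Fetches instance data from args.domain and, unless args.single is
    set, re-crawls all known instances with supported software whose last
    fetch is older than the configured 'recheck_instance' interval."""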
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
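    """Downloads a fixed set of CSV blocklists from the oliphant/blocklists
    repository on codeberg.org and records their entries, including
    reject_media/reject_reports flags, for the respective blockers."""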
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/gardenfence.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/birdsite.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if block["blocker"] != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
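    """Fetches static plain-text blocklists (currently seirdy.one's
    bsl.txt) and processes every listed domain for the given blocker."""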
1164     logger.debug("args[]='%s' - CALLED!", type(args))
1165
1166     logger.debug("Invoking locking.acquire() ...")
1167     locking.acquire()
1168
1169     # Static URLs
1170     urls = ({
1171         "blocker": "seirdy.one",
1172         "url"    : "https://seirdy.one/pb/bsl.txt",
1173     },)
1174
1175     logger.info("Checking %d text file(s) ...", len(urls))
1176     for row in urls:
1177         logger.debug("Fetching row[url]='%s' ...", row["url"])
1178         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1179
1180         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1181         if response.ok and response.status_code < 300 and response.text != "":
1182             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1183             domains = response.text.split("\n")
1184
1185             logger.info("Processing %d domains ...", len(domains))
1186             for domain in domains:
1187                 logger.debug("domain='%s' - BEFORE!", domain)
1188                 domain = tidyup.domain(domain)
1189
1190                 logger.debug("domain='%s' - AFTER!", domain)
1191                 if domain == "":
1192                     logger.debug("domain is empty - SKIPPED!")
1193                     continue
1194                 elif not domain_helper.is_wanted(domain):
1195                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1196                     continue
1197                 elif instances.is_recent(domain):
1198                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1199                     continue
1200
1201                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1202                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1203
1204                 logger.debug("processed='%s'", processed)
1205                 if not processed:
1206                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1207                     continue
1208
1209     logger.debug("Success! - EXIT!")
1210     return 0
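
# Note on the split above: str.split("\n") keeps a trailing empty string
# when the file ends with a newline, which is why empty domains are
# skipped after tidyup:
#
#   >>> "a.example\nb.example\n".split("\n")
#   ['a.example', 'b.example', '']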
1211
1212 def fetch_fedipact(args: argparse.Namespace) -> int:
1213     logger.debug("args[]='%s' - CALLED!", type(args))
1214
1215     logger.debug("Invoking locking.acquire() ...")
1216     locking.acquire()
1217
1218     source_domain = "fedipact.online"
1219     if sources.is_recent(source_domain):
1220         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1221         return 0
1222     else:
1223         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1224         sources.update(source_domain)
1225
1226     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1227     response = utils.fetch_url(
1228         f"https://{source_domain}",
1229         network.web_headers,
1230         (config.get("connection_timeout"), config.get("read_timeout"))
1231     )
1232
1233     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1234     if response.ok and response.status_code < 300 and response.text != "":
1235         logger.debug("Parsing %d Bytes ...", len(response.text))
1236
1237         doc = bs4.BeautifulSoup(response.text, "html.parser")
1238         logger.debug("doc[]='%s'", type(doc))
1239
1240         rows = doc.findAll("li")
1241         logger.info("Checking %d row(s) ...", len(rows))
1242         for row in rows:
1243             logger.debug("row[]='%s'", type(row))
1244             domain = tidyup.domain(row.contents[0])
1245
1246             logger.debug("domain='%s' - AFTER!", domain)
1247             if domain == "":
1248                 logger.debug("domain is empty - SKIPPED!")
1249                 continue
1250
1251             logger.debug("domain='%s' - BEFORE!", domain)
1252             domain = domain.encode("idna").decode("utf-8")
1253             logger.debug("domain='%s' - AFTER!", domain)
1254
1255             if not domain_helper.is_wanted(domain):
1256                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1257                 continue
1258             elif instances.is_registered(domain):
1259                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1260                 continue
1261             elif instances.is_recent(domain):
1262                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1263                 continue
1264
1265             logger.info("Fetching domain='%s' ...", domain)
1266             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1267
1268     logger.debug("Success! - EXIT!")
1269     return 0
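
# The idna codec used above converts Unicode hostnames to their ASCII
# (punycode) form; plain-ASCII labels pass through unchanged:
#
#   >>> "bücher.example".encode("idna").decode("utf-8")
#   'xn--bcher-kva.example'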
1270
1271 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1272     logger.debug("args[]='%s' - CALLED!", type(args))
1273
1274     logger.debug("Invoking locking.acquire() ...")
1275     locking.acquire()
1276
1277     source_domain = "instances.joinmobilizon.org"
1278     if sources.is_recent(source_domain):
1279         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1280         return 0
1281     else:
1282         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1283         sources.update(source_domain)
1284
1285     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1286     raw = utils.fetch_url(
1287         f"https://{source_domain}/api/v1/instances",
1288         network.web_headers,
1289         (config.get("connection_timeout"), config.get("read_timeout"))
1290     ).text
1291     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1292
1293     parsed = json.loads(raw)
1294     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1295
1296     if "data" not in parsed:
1297         logger.warning("parsed()=%d does not contain key 'data' - EXIT!", len(parsed))
1298         return 1
1299
1300     logger.info("Checking %d instances ...", len(parsed["data"]))
1301     for row in parsed["data"]:
1302         logger.debug("row[]='%s'", type(row))
1303         if "host" not in row:
1304             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1305             continue
1306         elif not domain_helper.is_wanted(row["host"]):
1307             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1308             continue
1309         elif instances.is_registered(row["host"]):
1310             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1311             continue
1312
1313         logger.info("Fetching row[host]='%s' ...", row["host"])
1314         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1315
1316     logger.debug("Success! - EXIT!")
1317     return 0
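
# Response shape relied on above (inferred from the key checks, not a
# documented contract; field values are hypothetical):
#
#   {"data": [{"host": "mobilizon.example"}, ...]}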
1318
1319 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1320     logger.debug("args[]='%s' - CALLED!", type(args))
1321
1322     logger.debug("Invoking locking.acquire() ...")
1323     locking.acquire()
1324
1325     source_domain = "instanceapp.misskey.page"
1326     if sources.is_recent(source_domain):
1327         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1328         return 0
1329     else:
1330         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1331         sources.update(source_domain)
1332
1333     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1334     raw = utils.fetch_url(
1335         f"https://{source_domain}/instances.json",
1336         network.web_headers,
1337         (config.get("connection_timeout"), config.get("read_timeout"))
1338     ).text
1339     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1340
1341     parsed = json.loads(raw)
1342     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1343
1344     if "instancesInfos" not in parsed:
1345         logger.warning("parsed()=%d does not contain element 'instancesInfos' - EXIT!", len(parsed))
1346         return 1
1347
1348     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1349     for row in parsed["instancesInfos"]:
1350         logger.debug("row[%s]='%s'", type(row), row)
1351         if "url" not in row:
1352             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1353             continue
1354         elif not domain_helper.is_wanted(row["url"]):
1355             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1356             continue
1357         elif instances.is_registered(row["url"]):
1358             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1359             continue
1360
1361         logger.info("Fetching row[url]='%s' ...", row["url"])
1362         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1363
1364     logger.debug("Success! - EXIT!")
1365     return 0
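
# Shape relied on above, again inferred only from the key checks:
#
#   {"instancesInfos": [{"url": "misskey.example"}, ...]}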
1366
1367 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1368     logger.debug("args[]='%s' - CALLED!", type(args))
1369
1370     logger.debug("Invoking locking.acquire() ...")
1371     locking.acquire()
1372
1373     source_domain = "joinfediverse.wiki"
1374     if sources.is_recent(source_domain):
1375         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1376         return 0
1377     else:
1378         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1379         sources.update(source_domain)
1380
1381     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1382     raw = utils.fetch_url(
1383         f"https://{source_domain}/FediBlock",
1384         network.web_headers,
1385         (config.get("connection_timeout"), config.get("read_timeout"))
1386     ).text
1387     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1388
1389     doc = bs4.BeautifulSoup(raw, "html.parser")
1390     logger.debug("doc[]='%s'", type(doc))
1391
1392     tables = doc.findAll("table", {"class": "wikitable"})
1393
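    # Hypothetical sketch of the wikitable layout this scraper expects;
    # header cells name the columns, data rows carry the values:
    #
    #   <table class="wikitable">
    #     <tr><th>instance</th><th>subdomain(s)</th><th>block reason(s)</th></tr>
    #     <tr><td>bad.example</td><td>mastodon/pleroma</td><td>spam</td></tr>
    #   </table>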
1394     logger.info("Analyzing %d table(s) ...", len(tables))
1395     blocklist = list()
1396     for table in tables:
1397         logger.debug("table[]='%s'", type(table))
1398
1399         rows = table.findAll("tr")
1400         logger.info("Checking %d row(s) ...", len(rows))
1401         block_headers = dict()
1402         for row in rows:
1403             logger.debug("row[%s]='%s'", type(row), row)
1404
1405             headers = row.findAll("th")
1406             logger.debug("Found headers()=%d header(s)", len(headers))
1407             if len(headers) > 1:
1408                 block_headers = dict()
1409                 cnt = 0
1410                 for header in headers:
1411                     cnt = cnt + 1
1412                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1413                     text = header.contents[0]
1414
1415                     logger.debug("text[]='%s'", type(text))
1416                     if not isinstance(text, str):
1417                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1418                         continue
1419                     elif validators.domain(text.strip()):
1420                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1421                         continue
1422
1423                     text = tidyup.domain(text.strip())
1424                     logger.debug("text='%s' - AFTER!", text)
1425                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1426                         logger.debug("Found header: '%s'=%d", text, cnt)
1427                         block_headers[cnt] = text
1428
1429             elif len(block_headers) == 0:
1430                 logger.debug("row is not scrapable - SKIPPED!")
1431                 continue
1432             elif len(block_headers) > 0:
1433                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1434                 cnt = 0
1435                 block = dict()
1436
1437                 for element in row.find_all(["th", "td"]):
1438                     cnt = cnt + 1
1439                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1440                     if cnt in block_headers:
1441                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1442
1443                         text = element.text.strip()
1444                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1445
1446                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1447                         if key == "blocked":
1448                             block[key] = text
1449                         elif key == "block reason(s)":
1450                             block[key] = tidyup.reason(text)
1451                         elif key == "subdomain(s)":
1452                             block[key] = list()
1453                             if text != "":
1454                                 block[key] = text.split("/")
1455                         else:
1456                             logger.debug("key='%s'", key)
1457                             block[key] = text
1458
1459                 logger.debug("block()=%d ...", len(block))
1460                 if len(block) > 0:
1461                     logger.debug("Appending block()=%d ...", len(block))
1462                     blocklist.append(block)
1463
1464     logger.debug("blocklist()=%d", len(blocklist))
1465
1466     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1467     domains = database.cursor.fetchall()
1468
1469     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1470     blocking = list()
1471     for block in blocklist:
1472         logger.debug("block='%s'", block)
1473         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1474             origin = block["blocked"]
1475             logger.debug("origin='%s'", origin)
1476             for subdomain in block["subdomain(s)"]:
1477                 # copy the block per subdomain; appending the same dict
1478                 # object would leave every entry with the last subdomain
1479                 entry = {**block, "blocked": subdomain + "." + origin}
1480                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1481                 blocking.append(entry)
1480         else:
1481             blocking.append(block)
1482
1483     logger.debug("blocking()=%d", len(blocking))
1484     for block in blocking:
1485         logger.debug("block[]='%s'", type(block))
1486         if "blocked" not in block:
1487             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1488
1489         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1490         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1491
1492         if block["blocked"] == "":
1493             logger.debug("block[blocked] is empty - SKIPPED!")
1494             continue
1495         elif not domain_helper.is_wanted(block["blocked"]):
1496             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1497             continue
1498         elif instances.is_recent(block["blocked"]):
1499             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1500             continue
1501
1502         logger.debug("Processing blocked='%s' ...", block["blocked"])
1503         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1504
1505     blockdict = list()
1506     for blocker in domains:
1507         blocker = blocker[0]
1508         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1509         instances.set_last_blocked(blocker)
1510
1511         for block in blocking:
1512             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1513             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1514
1515             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1516             if block["blocked"] == "":
1517                 logger.debug("block[blocked] is empty - SKIPPED!")
1518                 continue
1519             elif not domain_helper.is_wanted(block["blocked"]):
1520                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1521                 continue
1522
1523             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1524             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1525                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1526                 blockdict.append({
1527                     "blocked": block["blocked"],
1528                     "reason" : block["reason"],
1529                 })
1530
1531         if instances.has_pending(blocker):
1532             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1533             instances.update_data(blocker)
1534
1535         logger.debug("Invoking commit() ...")
1536         database.connection.commit()
1537
1538         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1539         if config.get("bot_enabled") and len(blockdict) > 0:
1540             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1541             network.send_bot_post(blocker, blockdict)
1542
1543     logger.debug("Success! - EXIT!")
1544     return 0
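
# Subdomain expansion example for the loop above (values hypothetical):
# {"blocked": "bad.example", "subdomain(s)": ["mastodon", "pleroma"]}
# yields two entries, "mastodon.bad.example" and "pleroma.bad.example".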
1545
1546 def recheck_obfuscation(args: argparse.Namespace) -> int:
1547     logger.debug("args[]='%s' - CALLED!", type(args))
1548
1549     logger.debug("Invoking locking.acquire() ...")
1550     locking.acquire()
1551
1552     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1553         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1554     elif isinstance(args.software, str) and args.software != "":
1555         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1556     else:
1557         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1558
1559     rows = database.cursor.fetchall()
1560     logger.info("Checking %d domains ...", len(rows))
1561     for row in rows:
1562         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1563         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1564             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1565             continue
1566
1567         blocking = list()
1568         if row["software"] == "pleroma":
1569             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1570             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1571         elif row["software"] == "mastodon":
1572             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1573             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1574         elif row["software"] == "lemmy":
1575             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1576             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1577         elif row["software"] == "friendica":
1578             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1579             blocking = friendica.fetch_blocks(row["domain"])
1580         elif row["software"] == "misskey":
1581             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1582             blocking = misskey.fetch_blocks(row["domain"])
1583         else:
1584             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1585
1586         logger.debug("row[domain]='%s'", row["domain"])
1587         # chaos.social requires special care ...
1588         if row["domain"] != "chaos.social":
1589             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1590             instances.set_last_blocked(row["domain"])
1591             instances.set_total_blocks(row["domain"], blocking)
1592
1593         obfuscated = 0
1594         blockdict = list()
1595
1596         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1597         for block in blocking:
1598             logger.debug("block[blocked]='%s'", block["blocked"])
1599             blocked = None
1600
1601             if block["blocked"] == "":
1602                 logger.debug("block[blocked] is empty - SKIPPED!")
1603                 continue
1604             elif block["blocked"].endswith(".arpa"):
1605                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1606                 continue
1607             elif block["blocked"].endswith(".tld"):
1608                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1609                 continue
1610             elif block["blocked"].endswith(".onion"):
1611                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1612                 continue
1613             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1614                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1615                 obfuscated = obfuscated + 1
1616                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1617             elif not domain_helper.is_wanted(block["blocked"]):
1618                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1619                 continue
1620             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1621                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1622                 continue
1623
1624             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1625             if blocked is not None and blocked != block["blocked"]:
1626                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1627                 obfuscated = obfuscated - 1
1628
1629                 if blocks.is_instance_blocked(row["domain"], blocked):
1630                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1631                     continue
1632                 elif blacklist.is_blacklisted(blocked):
1633                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1634                     continue
1635
1636                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1637
1638                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1639                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1640                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1641                     blockdict.append({
1642                         "blocked": blocked,
1643                         "reason" : block["reason"],
1644                     })
1645
1646         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1647         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1648
1649         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1650         if obfuscated == 0 and len(blocking) > 0:
1651             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1652             instances.set_has_obfuscation(row["domain"], False)
1653
1654         if instances.has_pending(row["domain"]):
1655             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1656             instances.update_data(row["domain"])
1657
1658         logger.debug("Invoking commit() ...")
1659         database.connection.commit()
1660
1661         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1662         if config.get("bot_enabled") and len(blockdict) > 0:
1663             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1664             network.send_bot_post(row["domain"], blockdict)
1665
1666     logger.debug("Success! - EXIT!")
1667     return 0
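
# A sketch of hash-assisted deobfuscation (an assumption: some blocklists
# expose a digest next to the censored name, hence block["hash"] above;
# the real utils.deobfuscate() may differ):
#
#   >>> import hashlib
#   >>> digest = hashlib.sha256(b"example.com").hexdigest()
#   >>> [d for d in ["example.com", "example.org"]
#   ...  if hashlib.sha256(d.encode("utf-8")).hexdigest() == digest]
#   ['example.com']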
1668
1669 def fetch_fedilist(args: argparse.Namespace) -> int:
1670     logger.debug("args[]='%s' - CALLED!", type(args))
1671
1672     logger.debug("Invoking locking.acquire() ...")
1673     locking.acquire()
1674
1675     source_domain = "demo.fedilist.com"
1676     if sources.is_recent(source_domain):
1677         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1678         return 0
1679     else:
1680         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1681         sources.update(source_domain)
1682
1683     url = f"http://{source_domain}/instance/csv?onion=not"
1684     if args.software is not None and args.software != "":
1685         logger.debug("args.software='%s'", args.software)
1686         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1687
1688     logger.info("Fetching url='%s' ...", url)
1689     response = reqto.get(
1690         url,
1691         headers=network.web_headers,
1692         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1693         allow_redirects=False
1694     )
1695
1696     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1697     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1698         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1699         return 1
1700
1701     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1702
1703     logger.debug("reader[]='%s'", type(reader))
1704     if reader is None:
1705         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1706         return 2
1707
1708     rows = list(reader)
1709
1710     logger.info("Checking %d rows ...", len(rows))
1711     for row in rows:
1712         logger.debug("row[]='%s'", type(row))
1713         if "hostname" not in row:
1714             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1715             continue
1716
1717         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1718         domain = tidyup.domain(row["hostname"])
1719         logger.debug("domain='%s' - AFTER!", domain)
1720
1721         if domain == "":
1722             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1723             continue
1724
1725         logger.debug("domain='%s' - BEFORE!", domain)
1726         domain = domain.encode("idna").decode("utf-8")
1727         logger.debug("domain='%s' - AFTER!", domain)
1728
1729         if not domain_helper.is_wanted(domain):
1730             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1731             continue
1732         elif (args.force is None or not args.force) and instances.is_registered(domain):
1733             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1734             continue
1735         elif instances.is_recent(domain):
1736             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1737             continue
1738
1739         logger.info("Fetching instances from domain='%s' ...", domain)
1740         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1741
1742     logger.debug("Success! - EXIT!")
1743     return 0
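
# The endpoint above is read with csv.DictReader, so the first line must
# be a header row containing at least "hostname" (other columns are
# hypothetical):
#
#   hostname,software,users
#   pleroma.example,pleroma,42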
1744
1745 def update_nodeinfo(args: argparse.Namespace) -> int:
1746     logger.debug("args[]='%s' - CALLED!", type(args))
1747
1748     logger.debug("Invoking locking.acquire() ...")
1749     locking.acquire()
1750
1751     if args.domain is not None and args.domain != "":
1752         logger.debug("Fetching args.domain='%s'", args.domain)
1753         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1754     elif args.software is not None and args.software != "":
1755         logger.info("Fetching domains for args.software='%s'", args.software)
1756         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1757     else:
1758         logger.info("Fetching domains for recently updated ...")
1759         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1760
1761     domains = database.cursor.fetchall()
1762
1763     logger.info("Checking %d domain(s) ...", len(domains))
1764     cnt = 0
1765     for row in domains:
1766         logger.debug("row[]='%s'", type(row))
1767         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1768             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1769             continue
1770
1771         try:
1772             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1773             software = federation.determine_software(row["domain"])
1774
1775             logger.debug("Determined software='%s'", software)
1776             if (software != row["software"] and software is not None) or args.force is True:
1777                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1778                 instances.set_software(row["domain"], software)
1779
1780             if software is not None:
1781                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1782                 instances.set_success(row["domain"])
1783         except network.exceptions as exception:
1784             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1785             instances.set_last_error(row["domain"], exception)
1786
1787         instances.set_last_nodeinfo(row["domain"])
1788         instances.update_data(row["domain"])
1789         cnt = cnt + 1
1790
1791     logger.debug("Success! - EXIT!")
1792     return 0
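
# The progress figure above is a plain percentage with one decimal,
# padded to width 5:
#
#   >>> "{:5.1f}".format(1 / 3 * 100)
#   ' 33.3'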
1793
1794 def fetch_instances_social(args: argparse.Namespace) -> int:
1795     logger.debug("args[]='%s' - CALLED!", type(args))
1796
1797     logger.debug("Invoking locking.acquire() ...")
1798     locking.acquire()
1799
1800     source_domain = "instances.social"
1801
1802     if config.get("instances_social_api_key") == "":
1803         logger.error("API key not set. Please set in your config.json file.")
1804         return 1
1805     elif sources.is_recent(source_domain):
1806         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1807         return 0
1808     else:
1809         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1810         sources.update(source_domain)
1811
1812     headers = {
1813         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1814     }
1815
1816     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1817     fetched = network.get_json_api(
1818         source_domain,
1819         "/api/1.0/instances/list?count=0&sort_by=name",
1820         headers,
1821         (config.get("connection_timeout"), config.get("read_timeout"))
1822     )
1823     logger.debug("fetched[]='%s'", type(fetched))
1824
1825     if "error_message" in fetched:
1826         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1827         return 2
1828     elif "exception" in fetched:
1829         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1830         return 3
1831     elif "json" not in fetched:
1832         logger.warning("fetched has no element 'json' - EXIT!")
1833         return 4
1834     elif "instances" not in fetched["json"]:
1835         logger.warning("fetched['json'] has no element 'instances' - EXIT!")
1836         return 5
1837
1838     domains = list()
1839     rows = fetched["json"]["instances"]
1840
1841     logger.info("Checking %d row(s) ...", len(rows))
1842     for row in rows:
1843         logger.debug("row[]='%s'", type(row))
1844         domain = tidyup.domain(row["name"])
1845         logger.debug("domain='%s' - AFTER!", domain)
1846
1847         if domain == "":
1848             logger.debug("domain is empty - SKIPPED!")
1849             continue
1850
1851         logger.debug("domain='%s' - BEFORE!", domain)
1852         domain = domain.encode("idna").decode("utf-8")
1853         logger.debug("domain='%s' - AFTER!", domain)
1854
1855         if not domain_helper.is_wanted(domain):
1856             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1857             continue
1858         elif domain in domains:
1859             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1860             continue
1861         elif instances.is_registered(domain):
1862             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1863             continue
1864         elif instances.is_recent(domain):
1865             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1866             continue
1867
1868         logger.info("Fetching instances from domain='%s'", domain)
1869         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1870
1871     logger.debug("Success! - EXIT!")
1872     return 0
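
# Result shape of network.get_json_api() as relied on above (inferred
# from the checks, not a documented contract): {"json": {...}} on
# success, or {"error_message": "..."} / {"exception": <Exception>} on
# failure.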
1873
1874 def fetch_relays(args: argparse.Namespace) -> int:
1875     logger.debug("args[]='%s' - CALLED!", type(args))
1876
1877     logger.debug("Invoking locking.acquire() ...")
1878     locking.acquire()
1879
1880     if args.domain is not None and args.domain != "":
1881         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1882     else:
1883         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1884
1885     domains = list()
1886     rows = database.cursor.fetchall()
1887
1888     logger.info("Checking %d relays ...", len(rows))
1889     for row in rows:
1890         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1891         peers = list()
1892         if not args.force and instances.is_recent(row["domain"]):
1893             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1894             continue
1895
1896         try:
1897             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1898             raw = utils.fetch_url(
1899                 f"https://{row['domain']}",
1900                 network.web_headers,
1901                 (config.get("connection_timeout"), config.get("read_timeout"))
1902             ).text
1903             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1904         except network.exceptions as exception:
1905             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1906             instances.set_last_error(row["domain"], exception)
1907             instances.set_last_instance_fetch(row["domain"])
1908             instances.update_data(row["domain"])
1909             continue
1910
1911         doc = bs4.BeautifulSoup(raw, features="html.parser")
1912         logger.debug("doc[]='%s'", type(doc))
1913
1914         logger.debug("row[software]='%s'", row["software"])
1915         if row["software"] == "activityrelay":
1916             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1917             tags = doc.findAll("p")
1918
1919             logger.debug("Checking %d paragraphs ...", len(tags))
1920             for tag in tags:
1921                 logger.debug("tag[]='%s'", type(tag))
1922                 if len(tag.contents) == 0:
1923                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1924                     continue
1925                 elif "registered instances" not in tag.contents[0]:
1926                     logger.debug("Skipping paragraph, text not found.")
1927                     continue
1928
1929                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1930                 for domain in tag.contents:
1931                     logger.debug("domain[%s]='%s'", type(domain), domain)
1932                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1933                         continue
1934
1935                     domain = str(domain)
1936                     logger.debug("domain='%s'", domain)
1937                     if not domain_helper.is_wanted(domain):
1938                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1939                         continue
1940
1941                     logger.debug("domain='%s' - BEFORE!", domain)
1942                     domain = tidyup.domain(domain)
1943                     logger.debug("domain='%s' - AFTER!", domain)
1944
1945                     if domain == "":
1946                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1947                         continue
1948                     elif domain not in peers:
1949                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1950                         peers.append(domain)
1951
1952                     if dict_helper.has_key(domains, "domain", domain):
1953                         logger.debug("domain='%s' already added", domain)
1954                         continue
1955
1956                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1957                     domains.append({
1958                         "domain": domain,
1959                         "origin": row["domain"],
1960                     })
1961         elif row["software"] in ["aoderelay", "selective-relay"]:
1962             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1963             if row["software"] == "aoderelay":
1964                 tags = doc.findAll("section", {"class": "instance"})
1965             else:
1966                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1967
1968             logger.debug("Checking %d tags ...", len(tags))
1969             for tag in tags:
1970                 logger.debug("tag[]='%s'", type(tag))
1971
1972                 link = tag.find("a")
1973                 logger.debug("link[%s]='%s'", type(link), link)
1974                 if link is None:
1975                     logger.warning("tag='%s' has no a-tag ...", tag)
1976                     continue
1977
1978                 components = urlparse(link["href"])
1979                 domain = components.netloc.lower()
1980
1981                 if not domain_helper.is_wanted(domain):
1982                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1983                     continue
1984
1985                 logger.debug("domain='%s' - BEFORE!", domain)
1986                 domain = tidyup.domain(domain)
1987                 logger.debug("domain='%s' - AFTER!", domain)
1988
1989                 if domain == "":
1990                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1991                     continue
1992                 elif domain not in peers:
1993                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1994                     peers.append(domain)
1995
1996                 if dict_helper.has_key(domains, "domain", domain):
1997                     logger.debug("domain='%s' already added", domain)
1998                     continue
1999
2000                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
2001                 domains.append({
2002                     "domain": domain,
2003                     "origin": row["domain"],
2004                 })
2005         else:
2006             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
2007
2008         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
2009         instances.set_last_instance_fetch(row["domain"])
2010
2011         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
2012         instances.set_total_peers(row["domain"], peers)
2013
2014         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
2015         instances.update_data(row["domain"])
2016
2017     logger.info("Checking %d domains ...", len(domains))
2018     for row in domains:
2019         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2020         if instances.is_registered(row["domain"]):
2021             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2022             continue
2023
2024         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2025         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2026
2027     logger.debug("Success! - EXIT!")
2028     return 0
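
# Link handling in fetch_relays() above: urlparse() isolates the host
# part of each href before it is tidied up:
#
#   >>> from urllib.parse import urlparse
#   >>> urlparse("https://Relay.Example/inbox").netloc.lower()
#   'relay.example'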
2029
2030 def convert_idna(args: argparse.Namespace) -> int:
2031     logger.debug("args[]='%s' - CALLED!", type(args))
2032
2033     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2034     rows = database.cursor.fetchall()
2035
2036     logger.debug("rows[]='%s'", type(rows))
2037     instances.translate_idnas(rows, "domain")
2038
2039     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2040     rows = database.cursor.fetchall()
2041
2042     logger.debug("rows[]='%s'", type(rows))
2043     instances.translate_idnas(rows, "origin")
2044
2045     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2046     rows = database.cursor.fetchall()
2047
2048     logger.debug("rows[]='%s'", type(rows))
2049     blocks.translate_idnas(rows, "blocker")
2050
2051     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2052     rows = database.cursor.fetchall()
2053
2054     logger.debug("rows[]='%s'", type(rows))
2055     blocks.translate_idnas(rows, "blocked")
2056
2057     logger.debug("Success! - EXIT!")
2058     return 0
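
# The NOT LIKE '%xn--%' filters above select rows not yet stored in
# punycode; the idna codec is a no-op for names that are already plain
# ASCII:
#
#   >>> "plain.example".encode("idna").decode("utf-8")
#   'plain.example'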