fba/commands.py
# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import argparse
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

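# A minimal usage sketch (hypothetical wiring; the real CLI dispatcher lives
# outside this module): check_instance() returns 0 when the domain is new and
# usable, or a non-zero status explaining why it was rejected.
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--domain")
#   status = check_instance(parser.parse_args(["--domain", "example.social"]))
#   # 0 = not yet known, 100 = invalid, 101 = blacklisted, 102 = registered
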
def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

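# Why the punycode round-trip matters, illustrated (plain Python, no project
# code involved): internationalized domains are stored in Unicode but may
# appear ASCII-encoded in nodeinfo URLs, so both spellings must be checked.
#
#   >>> "münchen.example".encode("idna").decode("utf-8")
#   'xn--mnchen-3ya.example'
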
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, so there is no need to add network.source_headers here yourself
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

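# Shape of the envelope this function expects from network.get_json_api(),
# reconstructed from the checks above (illustrative only; field values are
# made up):
#
#   fetched = {
#       "json": {
#           "data": [
#               {"domain": "pixelfed.example"},
#               ...
#           ]
#       }
#   }
#   # or, on failure: {"error_message": "...", ...}
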
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows(data)()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

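# The GraphQL exchange above, spelled out (illustrative; the response shape is
# inferred from how rows["data"]["nodeinfo"] is consumed):
#
#   request body:  {"query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"}
#   response JSON: {"data": {"nodeinfo": [{"domain": "one.example"},
#                                         {"domain": "two.example"}]}}
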
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not domain_helper.is_wanted(blocker):
            logger.debug("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("blocker='%s'", blocker)
        if blocker != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

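# Shape of one entry in `blocking`, inferred from the keys read above (the
# per-software fetch_blocks() helpers are expected to yield this; "hash" is
# optional and only present on obfuscating instances):
#
#   {
#       "blocker"    : "blocker.example",
#       "blocked"    : "blocked.example",   # may contain "*" or "?" obfuscation
#       "block_level": "reject",            # normalized via blocks.alias_block_level()
#       "reason"     : "spam",              # or None
#       "hash"       : "ab12...",           # optional
#   }
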
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s'", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

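# What the scrape above expects from fediverse.observer (illustrative HTML,
# reconstructed from the selectors used; the real markup may differ):
#
#   <div aria-labelledby="navbarDropdownMenuSoftwares">
#       <a class="dropdown-item">Mastodon</a> ...
#   </div>
#   <!-- /app/views/tabledata.php?software=mastodon then returns: -->
#   <a class="url">instance.example</a> ...
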
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blocked_domains = blocklist[block_level]

        logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
        for blocked in blocked_domains:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

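# HTML structure this scraper expects on wiki.todon.eu (sketched from the
# selectors above; the real page may differ in detail):
#
#   <h3 id="silencedlimited_servers">...</h3>
#   <ul>
#       <li><div>silenced.example</div> ...</li>
#   </ul>
#   <h3 id="suspended_servers">...</h3>
#   <ul>
#       <li><div>rejected.example</div> ...</li>
#   </ul>
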
def fetch_cs(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

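# A minimal sketch of the markdown-to-table step above (runnable on its own;
# the input row is made up, but the "extra" extension enables the table syntax
# that federation.md relies on):
#
#   import bs4, markdown
#   html = markdown.markdown("|Instance|Reason|\n|---|---|\n|bad.example|spam|", extensions=["extra"])
#   tbody = bs4.BeautifulSoup(html, "html.parser").find("table").find("tbody")
#   # tbody now holds one <tr> per blocked instance, as federation.find_domains() expects
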
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

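# Note on item.link.split("=")[1] above: it assumes each RSS item links to a
# detail page carrying the instance as the sole query value, e.g. (hypothetical)
#
#   https://fba.example/infos?domain=foo.example
#                                    ^^^^^^^^^^^ -> "foo.example"
#
# A feed whose links contain no "=" would raise IndexError here.
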
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/gardenfence.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/birdsite.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and does it not match this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if block["blocker"] != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

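# The two CSV dialects handled above, illustrated with made-up rows (Mastodon
# exports prefix their columns with '#', the unified lists do not):
#
#   #domain,#severity,#reject_media,#reject_reports,#public_comment
#   bad.example,suspend,true,true,spam
#
#   domain,severity,reject_media,reject_reports,public_comment
#   worse.example,silence,false,false,
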
1159 def fetch_txt(args: argparse.Namespace) -> int:
1160     logger.debug("args[]='%s' - CALLED!", type(args))
1161
1162     logger.debug("Invoking locking.acquire() ...")
1163     locking.acquire()
1164
1165     # Static URLs
1166     urls = ({
1167         "blocker": "seirdy.one",
1168         "url"    : "https://seirdy.one/pb/bsl.txt",
1169     },)
1170
1171     logger.info("Checking %d text file(s) ...", len(urls))
1172     for row in urls:
1173         logger.debug("Fetching row[url]='%s' ...", row["url"])
1174         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1175
1176         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1177         if response.ok and response.status_code < 300 and response.text != "":
1178             logger.debug("Returned %d bytes for processing", len(response.text.strip()))
1179             domains = response.text.split("\n")
1180
1181             logger.info("Processing %d domains ...", len(domains))
1182             for domain in domains:
1183                 logger.debug("domain='%s' - BEFORE!", domain)
1184                 domain = tidyup.domain(domain)
1185
1186                 logger.debug("domain='%s' - AFTER!", domain)
1187                 if domain == "":
1188                     logger.debug("domain is empty - SKIPPED!")
1189                     continue
1190                 elif not domain_helper.is_wanted(domain):
1191                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1192                     continue
1193                 elif instances.is_recent(domain):
1194                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1195                     continue
1196
1197                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1198                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1199
1200                 logger.debug("processed='%s'", processed)
1201                 if not processed:
1202                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1203                     continue
1204
1205     logger.debug("Success! - EXIT!")
1206     return 0
1207
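# fetch_fedipact() scrapes the public participant list on fedipact.online:
# each <li> element is expected to hold one instance domain, which is tidied,
# IDNA-encoded and then crawled. The "beach.city" argument below is passed as
# the origin hint to federation.fetch_instances().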
1208 def fetch_fedipact(args: argparse.Namespace) -> int:
1209     logger.debug("args[]='%s' - CALLED!", type(args))
1210
1211     logger.debug("Invoking locking.acquire() ...")
1212     locking.acquire()
1213
1214     source_domain = "fedipact.online"
1215     if sources.is_recent(source_domain):
1216         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1217         return 0
1218     else:
1219         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1220         sources.update(source_domain)
1221
1222     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1223     response = utils.fetch_url(
1224         f"https://{source_domain}",
1225         network.web_headers,
1226         (config.get("connection_timeout"), config.get("read_timeout"))
1227     )
1228
1229     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1230     if response.ok and response.status_code < 300 and response.text != "":
1231         logger.debug("Parsing %d bytes ...", len(response.text))
1232
1233         doc = bs4.BeautifulSoup(response.text, "html.parser")
1234         logger.debug("doc[]='%s'", type(doc))
1235
1236         rows = doc.findAll("li")
1237         logger.info("Checking %d row(s) ...", len(rows))
1238         for row in rows:
1239             logger.debug("row[]='%s'", type(row))
1240             domain = tidyup.domain(row.contents[0])
1241
1242             logger.debug("domain='%s' - AFTER!", domain)
1243             if domain == "":
1244                 logger.debug("domain is empty - SKIPPED!")
1245                 continue
1246
1247             logger.debug("domain='%s' - BEFORE!", domain)
1248             domain = domain.encode("idna").decode("utf-8")
1249             logger.debug("domain='%s' - AFTER!", domain)
1250
1251             if not domain_helper.is_wanted(domain):
1252                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1253                 continue
1254             elif instances.is_registered(domain):
1255                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1256                 continue
1257             elif instances.is_recent(domain):
1258                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1259                 continue
1260
1261             logger.info("Fetching domain='%s' ...", domain)
1262             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1263
1264     logger.debug("Success! - EXIT!")
1265     return 0
1266
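# fetch_joinmobilizon() consumes the JSON API at
# https://instances.joinmobilizon.org/api/v1/instances. Judging from the code
# below, the payload looks roughly like (a sketch, not the authoritative
# schema):
#
#     {"data": [{"host": "mobilizon.example", ...}, ...]}
#
# Only the "host" key of each row is used.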
1267 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1268     logger.debug("args[]='%s' - CALLED!", type(args))
1269
1270     logger.debug("Invoking locking.acquire() ...")
1271     locking.acquire()
1272
1273     source_domain = "instances.joinmobilizon.org"
1274     if sources.is_recent(source_domain):
1275         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1276         return 0
1277     else:
1278         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1279         sources.update(source_domain)
1280
1281     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1282     raw = utils.fetch_url(
1283         f"https://{source_domain}/api/v1/instances",
1284         network.web_headers,
1285         (config.get("connection_timeout"), config.get("read_timeout"))
1286     ).text
1287     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1288
1289     parsed = json.loads(raw)
1290     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1291
1292     if "data" not in parsed:
1293         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1294         return 1
1295
1296     logger.info("Checking %d instances ...", len(parsed["data"]))
1297     for row in parsed["data"]:
1298         logger.debug("row[]='%s'", type(row))
1299         if "host" not in row:
1300             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1301             continue
1302         elif not domain_helper.is_wanted(row["host"]):
1303             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1304             continue
1305         elif instances.is_registered(row["host"]):
1306             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1307             continue
1308
1309         logger.info("Fetching row[host]='%s' ...", row["host"])
1310         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1311
1312     logger.debug("Success! - EXIT!")
1313     return 0
1314
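# fetch_joinmisskey() reads instances.json from instanceapp.misskey.page.
# Judging from the code below, the payload looks roughly like:
#
#     {"instancesInfos": [{"url": "misskey.example", ...}, ...]}
#
# with "url" apparently holding a bare hostname rather than a full URL, since
# it is passed straight to domain checks and federation.fetch_instances().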
1315 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1316     logger.debug("args[]='%s' - CALLED!", type(args))
1317
1318     logger.debug("Invoking locking.acquire() ...")
1319     locking.acquire()
1320
1321     source_domain = "instanceapp.misskey.page"
1322     if sources.is_recent(source_domain):
1323         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1324         return 0
1325     else:
1326         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1327         sources.update(source_domain)
1328
1329     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1330     raw = utils.fetch_url(
1331         f"https://{source_domain}/instances.json",
1332         network.web_headers,
1333         (config.get("connection_timeout"), config.get("read_timeout"))
1334     ).text
1335     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1336
1337     parsed = json.loads(raw)
1338     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1339
1340     if "instancesInfos" not in parsed:
1341         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1342         return 1
1343
1344     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1345     for row in parsed["instancesInfos"]:
1346         logger.debug("row[%s]='%s'", type(row), row)
1347         if "url" not in row:
1348             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1349             continue
1350         elif not domain_helper.is_wanted(row["url"]):
1351             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1352             continue
1353         elif instances.is_registered(row["url"]):
1354             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1355             continue
1356
1357         logger.info("Fetching row[url]='%s' ...", row["url"])
1358         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1359
1360     logger.debug("Success! - EXIT!")
1361     return 0
1362
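# fetch_joinfediverse() scrapes the FediBlock page on joinfediverse.wiki:
# every "wikitable" is parsed by first reading its <th> header row to learn
# which column holds the domain/instance, subdomain(s) and block reason(s),
# then mapping each data row into a block dict keyed by those header names.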
1363 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1364     logger.debug("args[]='%s' - CALLED!", type(args))
1365
1366     logger.debug("Invoking locking.acquire() ...")
1367     locking.acquire()
1368
1369     source_domain = "joinfediverse.wiki"
1370     if sources.is_recent(source_domain):
1371         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1372         return 0
1373     else:
1374         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1375         sources.update(source_domain)
1376
1377     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1378     raw = utils.fetch_url(
1379         f"https://{source_domain}/FediBlock",
1380         network.web_headers,
1381         (config.get("connection_timeout"), config.get("read_timeout"))
1382     ).text
1383     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1384
1385     doc = bs4.BeautifulSoup(raw, "html.parser")
1386     logger.debug("doc[]='%s'", type(doc))
1387
1388     tables = doc.findAll("table", {"class": "wikitable"})
1389
1390     logger.info("Analyzing %d table(s) ...", len(tables))
1391     blocklist = list()
1392     for table in tables:
1393         logger.debug("table[]='%s'", type(table))
1394
1395         rows = table.findAll("tr")
1396         logger.info("Checking %d row(s) ...", len(rows))
1397         block_headers = dict()
1398         for row in rows:
1399             logger.debug("row[%s]='%s'", type(row), row)
1400
1401             headers = row.findAll("th")
1402             logger.debug("Found headers()=%d header(s)", len(headers))
1403             if len(headers) > 1:
1404                 block_headers = dict()
1405                 cnt = 0
1406                 for header in headers:
1407                     cnt = cnt + 1
1408                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1409                     text = header.contents[0]
1410
1411                     logger.debug("text[]='%s'", type(text))
1412                     if not isinstance(text, str):
1413                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1414                         continue
1415                     elif validators.domain(text.strip()):
1416                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1417                         continue
1418
1419                     text = tidyup.domain(text.strip())
1420                     logger.debug("text='%s' - AFTER!", text)
1421                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1422                         logger.debug("Found header: '%s'=%d", text, cnt)
1423                         block_headers[cnt] = text
1424
1425             elif len(block_headers) == 0:
1426                 logger.debug("row is not scrapable - SKIPPED!")
1427                 continue
1428             elif len(block_headers) > 0:
1429                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1430                 cnt = 0
1431                 block = dict()
1432
1433                 for element in row.find_all(["th", "td"]):
1434                     cnt = cnt + 1
1435                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1436                     if cnt in block_headers:
1437                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1438
1439                         text = element.text.strip()
1440                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1441
1442                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1443                         if key == "blocked":
1444                             block[key] = text
1445                         elif key == "reason":
1446                             block[key] = tidyup.reason(text)
1447                         elif key == "subdomain(s)":
1448                             block[key] = list()
1449                             if text != "":
1450                                 block[key] = text.split("/")
1451                         else:
1452                             logger.debug("key='%s'", key)
1453                             block[key] = text
1454
1455                 logger.debug("block()=%d ...", len(block))
1456                 if len(block) > 0:
1457                     logger.debug("Appending block()=%d ...", len(block))
1458                     blocklist.append(block)
1459
1460     logger.debug("blocklist()=%d", len(blocklist))
1461
1462     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1463     domains = database.cursor.fetchall()
1464
1465     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1466     blocking = list()
1467     for block in blocklist:
1468         logger.debug("block='%s'", block)
1469         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1470             origin = block["blocked"]
1471             logger.debug("origin='%s'", origin)
1472             for subdomain in block["subdomain(s)"]:
1473                 clone = {**block, "blocked": subdomain + "." + origin}  # copy, so entries don't all alias one dict
1474                 logger.debug("clone[blocked]='%s'", clone["blocked"])
1475                 blocking.append(clone)
1476         else:
1477             blocking.append(block)
1478
1479     logger.debug("blocking()=%d", len(blocking))
1480     for block in blocking:
1481         logger.debug("block[]='%s'", type(block))
1482         if "blocked" not in block:
1483             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1484
1485         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1486         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1487
1488         if block["blocked"] == "":
1489             logger.debug("block[blocked] is empty - SKIPPED!")
1490             continue
1491         elif not domain_helper.is_wanted(block["blocked"]):
1492             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1493             continue
1494         elif instances.is_recent(block["blocked"]):
1495             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1496             continue
1497
1498         logger.debug("Processing blocked='%s' ...", block["blocked"])
1499         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1500
1501     blockdict = list()
1502     for blocker in domains:
1503         blocker = blocker[0]
1504         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1505
1506         for block in blocking:
1507             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1508             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1509
1510             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1511             if block["blocked"] == "":
1512                 logger.debug("block[blocked] is empty - SKIPPED!")
1513                 continue
1514             elif not domain_helper.is_wanted(block["blocked"]):
1515                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1516                 continue
1517
1518             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1519             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1520                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1521                 blockdict.append({
1522                     "blocked": block["blocked"],
1523                     "reason" : block["reason"],
1524                 })
1525
1526         if instances.has_pending(blocker):
1527             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1528             instances.update_data(blocker)
1529
1530         logger.debug("Invoking commit() ...")
1531         database.connection.commit()
1532
1533         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1534         if config.get("bot_enabled") and len(blockdict) > 0:
1535             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1536             network.send_bot_post(blocker, blockdict)
1537
1538     logger.debug("Success! - EXIT!")
1539     return 0
1540
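# recheck_obfuscation() revisits instances flagged with has_obfuscation = 1
# and retries utils.deobfuscate() on wildcarded entries such as
# "*.example.com". The bookkeeping below works as a counter: each obfuscated
# entry increments `obfuscated`, each successful deobfuscation decrements it
# again, and a final count of 0 (with a non-empty block list) means the list
# is fully resolved, so the flag can be cleared.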
1541 def recheck_obfuscation(args: argparse.Namespace) -> int:
1542     logger.debug("args[]='%s' - CALLED!", type(args))
1543
1544     logger.debug("Invoking locking.acquire() ...")
1545     locking.acquire()
1546
1547     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1548         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1549     elif isinstance(args.software, str) and args.software != "":
1550         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1551     else:
1552         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1553
1554     rows = database.cursor.fetchall()
1555     logger.info("Checking %d domains ...", len(rows))
1556     for row in rows:
1557         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1558         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1559             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1560             continue
1561
1562         blocking = list()
1563         if row["software"] == "pleroma":
1564             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1565             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1566         elif row["software"] == "mastodon":
1567             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1568             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1569         elif row["software"] == "lemmy":
1570             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1571             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1572         elif row["software"] == "friendica":
1573             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1574             blocking = friendica.fetch_blocks(row["domain"])
1575         elif row["software"] == "misskey":
1576             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1577             blocking = misskey.fetch_blocks(row["domain"])
1578         else:
1579             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1580
1581         logger.debug("row[domain]='%s'", row["domain"])
1582         # chaos.social requires special care ...
1583         if row["domain"] != "chaos.social":
1584             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1585             instances.set_total_blocks(row["domain"], blocking)
1586
1587         obfuscated = 0
1588         blockdict = list()
1589
1590         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1591         for block in blocking:
1592             logger.debug("block[blocked]='%s'", block["blocked"])
1593             blocked = None
1594
1595             if block["blocked"] == "":
1596                 logger.debug("block[blocked] is empty - SKIPPED!")
1597                 continue
1598             elif block["blocked"].endswith(".arpa"):
1599                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1600                 continue
1601             elif block["blocked"].endswith(".tld"):
1602                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1603                 continue
1604             elif block["blocked"].endswith(".onion"):
1605                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1606                 continue
1607             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1608                 logger.debug("blocked='%s' is obfuscated.", block["blocked"])
1609                 obfuscated = obfuscated + 1
1610                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1611             elif not domain_helper.is_wanted(block["blocked"]):
1612                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1613                 continue
1614             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1615                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1616                 continue
1617
1618             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1619             if blocked is not None and blocked != block["blocked"]:
1620                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1621                 obfuscated = obfuscated - 1
1622                 if blocks.is_instance_blocked(row["domain"], blocked):
1623                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1624                     continue
1625
1626                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1627
1628                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1629                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1630                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1631                     blockdict.append({
1632                         "blocked": blocked,
1633                         "reason" : block["reason"],
1634                     })
1635
1636         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1637         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1638
1639         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1640         if obfuscated == 0 and len(blocking) > 0:
1641             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1642             instances.set_has_obfuscation(row["domain"], False)
1643
1644         if instances.has_pending(row["domain"]):
1645             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1646             instances.update_data(row["domain"])
1647
1648         logger.debug("Invoking commit() ...")
1649         database.connection.commit()
1650
1651         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1652         if config.get("bot_enabled") and len(blockdict) > 0:
1653             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1654             network.send_bot_post(row["domain"], blockdict)
1655
1656     logger.debug("Success! - EXIT!")
1657     return 0
1658
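# fetch_fedilist() downloads a CSV export from demo.fedilist.com. The query
# string "?software=...&onion=not" filters by software type and excludes
# onion hosts; each row is expected to carry a "hostname" column, roughly
# like this (a sketch, not verified against the live service):
#
#     hostname,software
#     pleroma.example,pleroma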
1659 def fetch_fedilist(args: argparse.Namespace) -> int:
1660     logger.debug("args[]='%s' - CALLED!", type(args))
1661
1662     logger.debug("Invoking locking.acquire() ...")
1663     locking.acquire()
1664
1665     source_domain = "demo.fedilist.com"
1666     if sources.is_recent(source_domain):
1667         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1668         return 0
1669     else:
1670         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1671         sources.update(source_domain)
1672
1673     url = f"http://{source_domain}/instance/csv?onion=not"
1674     if args.software is not None and args.software != "":
1675         logger.debug("args.software='%s'", args.software)
1676         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1677
1678     logger.info("Fetching url='%s' ...", url)
1679     response = reqto.get(
1680         url,
1681         headers=network.web_headers,
1682         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1683         allow_redirects=False
1684     )
1685
1686     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1687     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1688         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1689         return 1
1690
1691     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1692
1693     logger.debug("reader[]='%s'", type(reader))
1694     if reader is None:
1695         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1696         return 2
1697
1698     rows = list(reader)
1699
1700     logger.info("Checking %d rows ...", len(rows))
1701     for row in rows:
1702         logger.debug("row[]='%s'", type(row))
1703         if "hostname" not in row:
1704             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1705             continue
1706
1707         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1708         domain = tidyup.domain(row["hostname"])
1709         logger.debug("domain='%s' - AFTER!", domain)
1710
1711         if domain == "":
1712             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1713             continue
1714
1715         logger.debug("domain='%s' - BEFORE!", domain)
1716         domain = domain.encode("idna").decode("utf-8")
1717         logger.debug("domain='%s' - AFTER!", domain)
1718
1719         if not domain_helper.is_wanted(domain):
1720             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1721             continue
1722         elif (args.force is None or not args.force) and instances.is_registered(domain):
1723             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1724             continue
1725         elif instances.is_recent(domain):
1726             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1727             continue
1728
1729         logger.info("Fetching instances from domain='%s' ...", domain)
1730         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1731
1732     logger.debug("Success! - EXIT!")
1733     return 0
1734
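# update_nodeinfo() re-runs software detection for either one domain, one
# software type, or every instance whose last_nodeinfo timestamp is older
# than the "recheck_nodeinfo" interval from the configuration. Progress is
# logged as a percentage of the selected rows.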
1735 def update_nodeinfo(args: argparse.Namespace) -> int:
1736     logger.debug("args[]='%s' - CALLED!", type(args))
1737
1738     logger.debug("Invoking locking.acquire() ...")
1739     locking.acquire()
1740
1741     if args.domain is not None and args.domain != "":
1742         logger.debug("Fetching args.domain='%s'", args.domain)
1743         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1744     elif args.software is not None and args.software != "":
1745         logger.info("Fetching domains for args.software='%s'", args.software)
1746         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1747     else:
1748         logger.info("Fetching domains for recently updated ...")
1749         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1750
1751     domains = database.cursor.fetchall()
1752
1753     logger.info("Checking %d domain(s) ...", len(domains))
1754     cnt = 0
1755     for row in domains:
1756         logger.debug("row[]='%s'", type(row))
1757         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1758             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1759             continue
1760
1761         try:
1762             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1763             software = federation.determine_software(row["domain"])
1764
1765             logger.debug("Determined software='%s'", software)
1766             if (software != row["software"] and software is not None) or args.force is True:
1767                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1768                 instances.set_software(row["domain"], software)
1769
1770             if software is not None:
1771                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1772                 instances.set_success(row["domain"])
1773         except network.exceptions as exception:
1774             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1775             instances.set_last_error(row["domain"], exception)
1776
1777         instances.set_last_nodeinfo(row["domain"])
1778         instances.update_data(row["domain"])
1779         cnt = cnt + 1
1780
1781     logger.debug("Success! - EXIT!")
1782     return 0
1783
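# fetch_instances_social() uses the instances.social REST API, which requires
# a bearer token ("instances_social_api_key" in config.json). The endpoint
# /api/1.0/instances/list?count=0&sort_by=name returns the full instance
# list; only the "name" field of each row is used below.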
1784 def fetch_instances_social(args: argparse.Namespace) -> int:
1785     logger.debug("args[]='%s' - CALLED!", type(args))
1786
1787     logger.debug("Invoking locking.acquire() ...")
1788     locking.acquire()
1789
1790     source_domain = "instances.social"
1791
1792     if config.get("instances_social_api_key") == "":
1793         logger.error("API key not set. Please set in your config.json file.")
1794         return 1
1795     elif sources.is_recent(source_domain):
1796         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1797         return 0
1798     else:
1799         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1800         sources.update(source_domain)
1801
1802     headers = {
1803         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1804     }
1805
1806     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1807     fetched = network.get_json_api(
1808         source_domain,
1809         "/api/1.0/instances/list?count=0&sort_by=name",
1810         headers,
1811         (config.get("connection_timeout"), config.get("read_timeout"))
1812     )
1813     logger.debug("fetched[]='%s'", type(fetched))
1814
1815     if "error_message" in fetched:
1816         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1817         return 2
1818     elif "exception" in fetched:
1819         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1820         return 3
1821     elif "json" not in fetched:
1822         logger.warning("fetched has no element 'json' - EXIT!")
1823         return 4
1824     elif "instances" not in fetched["json"]:
1825         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1826         return 5
1827
1828     domains = list()
1829     rows = fetched["json"]["instances"]
1830
1831     logger.info("Checking %d row(s) ...", len(rows))
1832     for row in rows:
1833         logger.debug("row[]='%s'", type(row))
1834         domain = tidyup.domain(row["name"])
1835         logger.debug("domain='%s' - AFTER!", domain)
1836
1837         if domain == "":
1838             logger.debug("domain is empty - SKIPPED!")
1839             continue
1840
1841         logger.debug("domain='%s' - BEFORE!", domain)
1842         domain = domain.encode("idna").decode("utf-8")
1843         logger.debug("domain='%s' - AFTER!", domain)
1844
1845         if not domain_helper.is_wanted(domain):
1846             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1847             continue
1848         elif domain in domains:
1849             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1850             continue
1851         elif instances.is_registered(domain):
1852             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1853             continue
1854         elif instances.is_recent(domain):
1855             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1856             continue
1857
1858         logger.info("Fetching instances from domain='%s'", domain)
1859         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1860
1861     logger.debug("Success! - EXIT!")
1862     return 0
1863
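# fetch_relays() has no JSON API to query: the landing page of each known
# relay is scraped with BeautifulSoup instead. activityrelay lists its peers
# in a <p> following the words "registered instances", while aoderelay and
# selective-relay expose them as links inside <section class="instance"> and
# <div id="instances"> elements respectively.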
1864 def fetch_relays(args: argparse.Namespace) -> int:
1865     logger.debug("args[]='%s' - CALLED!", type(args))
1866
1867     logger.debug("Invoking locking.acquire() ...")
1868     locking.acquire()
1869
1870     if args.domain is not None and args.domain != "":
1871         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1872     else:
1873         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1874
1875     domains = list()
1876     rows = database.cursor.fetchall()
1877
1878     logger.info("Checking %d relays ...", len(rows))
1879     for row in rows:
1880         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1881         if not args.force and instances.is_recent(row["domain"]):
1882             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1883             continue
1884
1885         try:
1886             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1887             raw = utils.fetch_url(
1888                 f"https://{row['domain']}",
1889                 network.web_headers,
1890                 (config.get("connection_timeout"), config.get("read_timeout"))
1891             ).text
1892             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1893         except network.exceptions as exception:
1894             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1895             instances.set_last_error(row["domain"], exception)
1896             instances.set_last_instance_fetch(row["domain"])
1897             instances.update_data(row["domain"])
1898             continue
1899
1900         doc = bs4.BeautifulSoup(raw, features="html.parser")
1901         logger.debug("doc[]='%s'", type(doc))
1902
1903         logger.debug("row[software]='%s'", row["software"])
1904         if row["software"] == "activityrelay":
1905             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1906             tags = doc.findAll("p")
1907
1908             logger.debug("Checking %d paragraphs ...", len(tags))
1909             for tag in tags:
1910                 logger.debug("tag[]='%s'", type(tag))
1911                 if len(tag.contents) == 0:
1912                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1913                     continue
1914                 elif "registered instances" not in tag.contents[0]:
1915                     logger.debug("Skipping paragraph, text not found.")
1916                     continue
1917
1918                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1919                 for domain in tag.contents:
1920                     logger.debug("domain[%s]='%s'", type(domain), domain)
1921                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1922                         continue
1923
1924                     domain = str(domain)
1925                     if not domain_helper.is_wanted(domain):
1926                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1927                         continue
1928
1929                     logger.debug("domain='%s' - BEFORE!", domain)
1930                     domain = tidyup.domain(domain)
1931                     logger.debug("domain='%s' - AFTER!", domain)
1932
1933                     if domain == "":
1934                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1935                         continue
1936                     elif instances.is_registered(domain):
1937                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1938                         continue
1939                     elif dict_helper.has_key(domains, "domain", domain):
1940                         logger.debug("domain='%s' already added", domain)
1941                         continue
1942
1943                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1944                     domains.append({
1945                         "domain": domain,
1946                         "origin": row["domain"],
1947                     })
1948         elif row["software"] in ["aoderelay", "selective-relay"]:
1949             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1950             if row["software"] == "aoderelay":
1951                 tags = doc.findAll("section", {"class": "instance"})
1952             else:
1953                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1954
1955             logger.debug("Checking %d tags ...", len(tags))
1956             for tag in tags:
1957                 logger.debug("tag[]='%s'", type(tag))
1958
1959                 link = tag.find("a")
1960                 logger.debug("link[%s]='%s'", type(link), link)
1961                 if link is None:
1962                     logger.warning("tag='%s' has no a-tag ...", tag)
1963                     continue
1964
1965                 components = urlparse(link["href"])
1966                 domain = components.netloc.lower()
1967
1968                 if not domain_helper.is_wanted(domain):
1969                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1970                     continue
1971
1972                 logger.debug("domain='%s' - BEFORE!", domain)
1973                 domain = tidyup.domain(domain)
1974                 logger.debug("domain='%s' - AFTER!", domain)
1975
1976                 if domain == "":
1977                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1978                     continue
1979                 elif instances.is_registered(domain):
1980                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1981                     continue
1982                 elif dict_helper.has_key(domains, "domain", domain):
1983                     logger.debug("domain='%s' already added", domain)
1984                     continue
1985
1986                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1987                 domains.append({
1988                     "domain": domain,
1989                     "origin": row["domain"],
1990                 })
1991         else:
1992             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1993
1994         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1995         instances.set_last_instance_fetch(row["domain"])
1996         instances.update_data(row["domain"])
1997
1998     logger.info("Found %d domains to add ...", len(domains))
1999     for row in domains:
2000         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2001         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2002
2003     logger.debug("Success! - EXIT!")
2004     return 0
2005
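# convert_idna() retrofits punycode onto rows stored before IDNA handling was
# added, using Python's idna codec as elsewhere in this file. For
# illustration:
#
#     >>> "müller.example".encode("idna").decode("utf-8")
#     'xn--mller-kva.example'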
2006 def convert_idna(args: argparse.Namespace) -> int:
2007     logger.debug("args[]='%s' - CALLED!", type(args))
2008
2009     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2010     rows = database.cursor.fetchall()
2011
2012     logger.debug("rows[]='%s'", type(rows))
2013     instances.translate_idnas(rows, "domain")
2014
2015     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2016     rows = database.cursor.fetchall()
2017
2018     logger.debug("rows[]='%s'", type(rows))
2019     instances.translate_idnas(rows, "origin")
2020
2021     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2022     rows = database.cursor.fetchall()
2023
2024     logger.debug("rows[]='%s'", type(rows))
2025     blocks.translate_idnas(rows, "blocker")
2026
2027     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2028     rows = database.cursor.fetchall()
2029
2030     logger.debug("rows[]='%s'", type(rows))
2031     blocks.translate_idnas(rows, "blocked")
2032
2033     logger.debug("Success! - EXIT!")
2034     return 0