# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

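# Checks a single domain: returns 100 if it is invalid, 101 if blacklisted,
# 102 if already registered and 0 if it is still unknown.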
def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

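# Sanity-checks all stored nodeinfo URLs: counts entries whose absolute
# nodeinfo_url contains neither the domain nor its punycode representation.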
def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

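# Fetches the server list from pixelfed.org's API and registers all new,
# wanted domains as instances.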
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is sent by default, so network.source_headers doesn't need to be added here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
123         logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
124         return list()

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

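# Fetches a domain list from the gql.api.bka.li GraphQL API and registers all
# new, wanted domains as instances.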
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
186         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
205             logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
206             return 100
207         elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
208             logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])
209             return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

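# Fetches blocklists either for a single domain, for all instances of a given
# software or for all instances whose last check is older than recheck_block,
# then records the found blocks.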
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
276             logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
277             return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocking()=%d,nodeinfo_url='%s'", len(blocking), nodeinfo_url)
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s'", blocker, software)
            if software == "pleroma":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

414             logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
415             if block["blocked"] == "":
416                 logger.debug("block[blocked] is empty - SKIPPED!")
417                 continue
418
419             logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
420             block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
421             logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
422
423             if not domain_helper.is_wanted(block["blocked"]):
424                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
425                 continue
426             elif block["block_level"] in ["accept", "accepted"]:
427                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
428                 continue
429             elif not instances.is_registered(block["blocked"]):
430                 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
431                 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
432
433             block["block_level"] = blocks.alias_block_level(block["block_level"])
434
435             if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
436                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

456         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

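# Fetches the instance tables from fediverse.observer, optionally restricted
# to a single software type, and registers all new, wanted domains.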
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
472         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - AFTER!", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

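# Parses the domain-block wiki page at wiki.todon.eu and records todon.eu's
# silenced and suspended servers.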
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
573         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

614         logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
623                     logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
632                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

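# Fetches chaos.social's federation.md from raw.githubusercontent.com, parses
# the rendered Markdown tables and records the silenced/blocked instances.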
def fetch_cs(args: argparse.Namespace) -> int:
655     logger.debug("args[]='%s' - CALLED!", type(args))
656
657     logger.debug("Invoking locking.acquire() ...")
658     locking.acquire()
659
660     extensions = [
661         "extra",
662         "abbr",
663         "attr_list",
664         "def_list",
665         "fenced_code",
666         "footnotes",
667         "md_in_html",
668         "admonition",
669         "codehilite",
670         "legacy_attrs",
671         "legacy_em",
672         "meta",
673         "nl2br",
674         "sane_lists",
675         "smarty",
676         "toc",
677         "wikilinks"
678     ]
679
680     blocklist = {
681         "silenced": list(),
682         "reject"  : list(),
683     }
684
685     source_domain = "raw.githubusercontent.com"
686     if sources.is_recent(source_domain):
687         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
727                 if not "domain" in row:
728                     logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
729                     continue
730                 elif not instances.is_registered(row["domain"]):
731                     try:
732                         logger.info("Fetching instances from domain='%s' ...", row["domain"])
733                         federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
734                     except network.exceptions as exception:
735                         logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
736                         instances.set_last_error(row["domain"], exception)
737
738                 if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
739                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

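# Parses an FBA-specific RSS feed (given via --feed) and registers all new,
# wanted domains found in the item links.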
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
772         logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

778     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

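# Parses the Atom feed of the FBA bot account (ryona.agency by default, or the
# feed given via --feed) and registers all new, wanted domains linked there.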
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
848         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

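# Fetches the peer list of the given domain and, unless --single is set, also
# re-crawls known instances whose last fetch is older than recheck_instance.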
def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
943         logger.info("Fetching instances from args.domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
973             logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

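# Downloads the CSV blocklists from oliphant's Codeberg repository and records
# each listed block, honouring the reject_media/reject_reports columns.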
def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
994         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code > 200 or response.content == b"":
1023             logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
1024             continue
1025
1026         logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
1027         reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1028
1029         blockdict = list()
1030
1031         cnt = 0
1032         for row in reader:
1033             logger.debug("row[%s]='%s'", type(row), row)
1034             domain = severity = None
1035             reject_media = reject_reports = False
1036
1037             if "#domain" in row:
1038                 domain = row["#domain"]
1039             elif "domain" in row:
1040                 domain = row["domain"]
1041             else:
1042                 logger.debug("row='%s' does not contain domain column", row)
1043                 continue
1044
1045             if "#severity" in row:
1046                 severity = blocks.alias_block_level(row["#severity"])
1047             elif "severity" in row:
1048                 severity = blocks.alias_block_level(row["severity"])
1049             else:
1050                 logger.debug("row='%s' does not contain severity column", row)
1051                 continue
1052
1053             if "#reject_media" in row and row["#reject_media"].lower() == "true":
1054                 reject_media = True
1055             elif "reject_media" in row and row["reject_media"].lower() == "true":
1056                 reject_media = True
1057
1058             if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1059                 reject_reports = True
1060             elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1061                 reject_reports = True
1062
1063             cnt = cnt + 1
1064             logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
1065             if domain == "":
1066                 logger.debug("domain is empty - SKIPPED!")
1067                 continue
1068             elif domain.endswith(".onion"):
1069                 logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
1070                 continue
1071             elif domain.endswith(".arpa"):
1072                 logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
1073                 continue
1074             elif domain.endswith(".tld"):
1075                 logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
1076                 continue
1077             elif domain.find("*") >= 0 or domain.find("?") >= 0:
1078                 logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
1079                 domain = utils.deobfuscate(domain, block["blocker"])
1080                 logger.debug("domain='%s' - AFTER!", domain)
1081
1082             if not validators.domain(domain):
1083                 logger.debug("domain='%s' is not a valid domain - SKIPPED!")
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
1100                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1101                 blockdict.append({
1102                     "blocked": domain,
1103                     "reason" : block["reason"],
1104                 })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.has(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

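# Fetches static plain-text blocklists (currently seirdy.one's bsl.txt) and
# processes each listed domain.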
1132 def fetch_txt(args: argparse.Namespace) -> int:
1133     logger.debug("args[]='%s' - CALLED!", type(args))
1134
1135     logger.debug("Invoking locking.acquire() ...")
1136     locking.acquire()
1137
1138     # Static URLs
1139     urls = ({
1140         "blocker": "seirdy.one",
1141         "url"    : "https://seirdy.one/pb/bsl.txt",
1142     },)
1143
1144     logger.info("Checking %d text file(s) ...", len(urls))
1145     for row in urls:
1146         logger.debug("Fetching row[url]='%s' ...", row["url"])
1147         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1148
1149         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1150         if response.ok and response.status_code == 200 and response.text != "":
1151             logger.debug("Returned %d bytes for processing", len(response.text.strip()))
1152             domains = response.text.split("\n")
1153
1154             logger.info("Processing %d domains ...", len(domains))
1155             for domain in domains:
1156                 logger.debug("domain='%s' - BEFORE!", domain)
1157                 domain = tidyup.domain(domain)
1158
1159                 logger.debug("domain='%s' - AFTER!", domain)
1160                 if domain == "":
1161                     logger.debug("domain is empty - SKIPPED!")
1162                     continue
1163                 elif not domain_helper.is_wanted(domain):
1164                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1165                     continue
1166                 elif instances.is_recent(domain):
1167                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1168                     continue
1169
1170                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1171                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1172
1173                 logger.debug("processed='%s'", processed)
1174                 if not processed:
1175                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1176                     continue
1177
1178     logger.debug("Success! - EXIT!")
1179     return 0
1180
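# Scrapes the fedipact.online start page and registers every pledged instance
# (one <li> element per domain) that is not yet known.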
1181 def fetch_fedipact(args: argparse.Namespace) -> int:
1182     logger.debug("args[]='%s' - CALLED!", type(args))
1183
1184     logger.debug("Invoking locking.acquire() ...")
1185     locking.acquire()
1186
1187     source_domain = "fedipact.online"
1188     if sources.is_recent(source_domain):
1189         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1190         return 0
1191     else:
1192         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1193         sources.update(source_domain)
1194
1195     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1196     response = utils.fetch_url(
1197         f"https://{source_domain}",
1198         network.web_headers,
1199         (config.get("connection_timeout"), config.get("read_timeout"))
1200     )
1201
1202     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1203     if response.ok and response.status_code == 200 and response.text != "":
1204         logger.debug("Parsing %d bytes ...", len(response.text))
1205
1206         doc = bs4.BeautifulSoup(response.text, "html.parser")
1207         logger.debug("doc[]='%s'", type(doc))
1208
1209         rows = doc.findAll("li")
1210         logger.info("Checking %d row(s) ...", len(rows))
1211         for row in rows:
1212             logger.debug("row[]='%s'", type(row))
1213             domain = tidyup.domain(row.contents[0])
1214
1215             logger.debug("domain='%s' - AFTER!", domain)
1216             if domain == "":
1217                 logger.debug("domain is empty - SKIPPED!")
1218                 continue
1219
1220             logger.debug("domain='%s' - BEFORE!", domain)
1221             domain = domain.encode("idna").decode("utf-8")
1222             logger.debug("domain='%s' - AFTER!", domain)
1223
1224             if not domain_helper.is_wanted(domain):
1225                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1226                 continue
1227             elif instances.is_registered(domain):
1228                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1229                 continue
1230             elif instances.is_recent(domain):
1231                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1232                 continue
1233
1234             logger.info("Fetching domain='%s' ...", domain)
1235             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1236
1237     logger.debug("Success! - EXIT!")
1238     return 0
1239
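# Fetches the public instance list from instances.joinmobilizon.org and
# registers every new Mobilizon instance found in its 'data' array.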
1240 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1241     logger.debug("args[]='%s' - CALLED!", type(args))
1242
1243     logger.debug("Invoking locking.acquire() ...")
1244     locking.acquire()
1245
1246     source_domain = "instances.joinmobilizon.org"
1247     if sources.is_recent(source_domain):
1248         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1249         return 0
1250     else:
1251         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1252         sources.update(source_domain)
1253
1254     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1255     raw = utils.fetch_url(
1256         f"https://{source_domain}/api/v1/instances",
1257         network.web_headers,
1258         (config.get("connection_timeout"), config.get("read_timeout"))
1259     ).text
1260     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1261
1262     parsed = json.loads(raw)
1263     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1264
1265     if "data" not in parsed:
1266         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1267         return 1
1268
1269     logger.info("Checking %d instances ...", len(parsed["data"]))
1270     for row in parsed["data"]:
1271         logger.debug("row[]='%s'", type(row))
1272         if "host" not in row:
1273             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1274             continue
1275         elif not domain_helper.is_wanted(row["host"]):
1276             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1277             continue
1278         elif instances.is_registered(row["host"]):
1279             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1280             continue
1281
1282         logger.info("Fetching row[host]='%s' ...", row["host"])
1283         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1284
1285     logger.debug("Success! - EXIT!")
1286     return 0
1287
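# Fetches instances.json from instanceapp.misskey.page and registers every new
# Misskey instance listed under 'instancesInfos'.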
1288 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1289     logger.debug("args[]='%s' - CALLED!", type(args))
1290
1291     logger.debug("Invoking locking.acquire() ...")
1292     locking.acquire()
1293
1294     source_domain = "instanceapp.misskey.page"
1295     if sources.is_recent(source_domain):
1296         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1297         return 0
1298     else:
1299         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1300         sources.update(source_domain)
1301
1302     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1303     raw = utils.fetch_url(
1304         f"https://{source_domain}/instances.json",
1305         network.web_headers,
1306         (config.get("connection_timeout"), config.get("read_timeout"))
1307     ).text
1308     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1309
1310     parsed = json.loads(raw)
1311     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1312
1313     if "instancesInfos" not in parsed:
1314         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1315         return 1
1316
1317     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1318     for row in parsed["instancesInfos"]:
1319         logger.debug("row[%s]='%s'", type(row), row)
1320         if "url" not in row:
1321             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1322             continue
1323         elif not domain_helper.is_wanted(row["url"]):
1324             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1325             continue
1326         elif instances.is_registered(row["url"]):
1327             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1328             continue
1329
1330         logger.info("Fetching row[url]='%s' ...", row["url"])
1331         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1332
1333     logger.debug("Success! - EXIT!")
1334     return 0
1335
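# Scrapes the "FediBlock" wiki page on joinfediverse.wiki: its 'wikitable'
# tables are parsed into block records (expanding listed subdomains into full
# domain names) which are then recorded as blocks for the climatejustice.*
# instances found in the local database.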
1336 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1337     logger.debug("args[]='%s' - CALLED!", type(args))
1338
1339     logger.debug("Invoking locking.acquire() ...")
1340     locking.acquire()
1341
1342     source_domain = "joinfediverse.wiki"
1343     if sources.is_recent(source_domain):
1344         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1345         return 0
1346     else:
1347         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1348         sources.update(source_domain)
1349
1350     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1351     raw = utils.fetch_url(
1352         f"https://{source_domain}/FediBlock",
1353         network.web_headers,
1354         (config.get("connection_timeout"), config.get("read_timeout"))
1355     ).text
1356     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1357
1358     doc = bs4.BeautifulSoup(raw, "html.parser")
1359     logger.debug("doc[]='%s'", type(doc))
1360
1361     tables = doc.findAll("table", {"class": "wikitable"})
1362
1363     logger.info("Analyzing %d table(s) ...", len(tables))
1364     blocklist = list()
1365     for table in tables:
1366         logger.debug("table[]='%s'", type(table))
1367
1368         rows = table.findAll("tr")
1369         logger.info("Checking %d row(s) ...", len(rows))
1370         block_headers = dict()
1371         for row in rows:
1372             logger.debug("row[%s]='%s'", type(row), row)
1373
1374             headers = row.findAll("th")
1375             logger.debug("Found headers()=%d header(s)", len(headers))
1376             if len(headers) > 1:
1377                 block_headers = dict()
1378                 cnt = 0
1379                 for header in headers:
1380                     cnt = cnt + 1
1381                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1382                     text = header.contents[0]
1383
1384                     logger.debug("text[]='%s'", type(text))
1385                     if not isinstance(text, str):
1386                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1387                         continue
1388                     elif validators.domain(text.strip()):
1389                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1390                         continue
1391
1392                     text = tidyup.domain(text.strip())
1393                     logger.debug("text='%s' - AFTER!", text)
1394                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1395                         logger.debug("Found header: '%s'=%d", text, cnt)
1396                         block_headers[cnt] = text
1397
1398             elif len(block_headers) == 0:
1399                 logger.debug("row is not scrapable - SKIPPED!")
1400                 continue
1401             elif len(block_headers) > 0:
1402                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1403                 cnt = 0
1404                 block = dict()
1405
1406                 for element in row.find_all(["th", "td"]):
1407                     cnt = cnt + 1
1408                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1409                     if cnt in block_headers:
1410                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1411
1412                         text = element.text.strip()
1413                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1414
1415                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1416                         if key == "blocked":
1417                             block[key] = text
1418                         elif key == "block reason(s)":
1419                             block[key] = tidyup.reason(text)
1420                         elif key == "subdomain(s)":
1421                             block[key] = list()
1422                             if text != "":
1423                                 block[key] = text.split("/")
1424                         else:
1425                             logger.debug("key='%s'", key)
1426                             block[key] = text
1427
1428                 logger.debug("block()=%d ...", len(block))
1429                 if len(block) > 0:
1430                     logger.debug("Appending block()=%d ...", len(block))
1431                     blocklist.append(block)
1432
1433     logger.debug("blocklist()=%d", len(blocklist))
1434
1435     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1436     domains = database.cursor.fetchall()
1437
1438     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1439     blocking = list()
1440     for block in blocklist:
1441         logger.debug("block='%s'", block)
1442         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1443             origin = block["blocked"]
1444             logger.debug("origin='%s'", origin)
1445             for subdomain in block["subdomain(s)"]:
1446                 entry = dict(block, blocked=subdomain + "." + origin) # copy, otherwise all appended rows would share one dict
1447                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1448                 blocking.append(entry)
1449         else:
1450             blocking.append(block)
1451
1452     logger.debug("blocking()=%d", len(blocking))
1453     for block in blocking:
1454         logger.debug("block[]='%s'", type(block))
1455         if "blocked" not in block:
1456             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1457
1458         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1459         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1460
1461         if block["blocked"] == "":
1462             logger.debug("block[blocked] is empty - SKIPPED!")
1463             continue
1464         elif not domain_helper.is_wanted(block["blocked"]):
1465             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1466             continue
1467         elif instances.is_recent(block["blocked"]):
1468             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1469             continue
1470
1471         logger.debug("Processing blocked='%s' ...", block["blocked"])
1472         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1473
1474     blockdict = list()
1475     for blocker in domains:
1476         blocker = blocker[0]
1477         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1478         instances.set_last_blocked(blocker)
1479
1480         for block in blocking:
1481             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1482             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1483
1484             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1485             if block["blocked"] == "":
1486                 logger.debug("block[blocked] is empty - SKIPPED!")
1487                 continue
1488             elif not domain_helper.is_wanted(block["blocked"]):
1489                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1490                 continue
1491
1492             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1493             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1494                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1495                 blockdict.append({
1496                     "blocked": block["blocked"],
1497                     "reason" : block["reason"],
1498                 })
1499
1500         if instances.has_pending(blocker):
1501             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1502             instances.update(blocker)
1503
1504         logger.debug("Invoking commit() ...")
1505         database.connection.commit()
1506
1507         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1508         if config.get("bot_enabled") and len(blockdict) > 0:
1509             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1510             network.send_bot_post(blocker, blockdict)
1511
1512     logger.debug("Success! - EXIT!")
1513     return 0
1514
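# Re-fetches block lists from instances flagged with has_obfuscation=1 and
# tries to deobfuscate entries containing wildcards ('*' or '?') back into
# real domain names, updating the instance's obfuscation counter afterwards.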
1515 def recheck_obfuscation(args: argparse.Namespace) -> int:
1516     logger.debug("args[]='%s' - CALLED!", type(args))
1517
1518     logger.debug("Invoking locking.acquire() ...")
1519     locking.acquire()
1520
1521     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1522         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1523     elif isinstance(args.software, str) and args.software != "":
1524         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1525     else:
1526         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1527
1528     rows = database.cursor.fetchall()
1529     logger.info("Checking %d domains ...", len(rows))
1530     for row in rows:
1531         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1532         if not args.force and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1533             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1534             continue
1535
1536         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1537         blocking = federation.fetch_blocks(row["domain"])
1538
1539         logger.debug("blocking()=%d", len(blocking))
1540         if len(blocking) == 0:
1541             if row["software"] == "pleroma":
1542                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1543                 blocking = pleroma.fetch_blocks(row["domain"])
1544             elif row["software"] == "mastodon":
1545                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1546                 blocking = mastodon.fetch_blocks(row["domain"])
1547             elif row["software"] == "lemmy":
1548                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1549                 blocking = lemmy.fetch_blocks(row["domain"])
1550             elif row["software"] == "friendica":
1551                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1552                 blocking = friendica.fetch_blocks(row["domain"])
1553             elif row["software"] == "misskey":
1554                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1555                 blocking = misskey.fetch_blocks(row["domain"])
1556             else:
1557                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1558
1559         # chaos.social isn't part of oliphant's "hidden" blocklists
1560         logger.debug("row[domain]='%s'", row["domain"])
1561         if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]):
1562             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1563             instances.set_last_blocked(row["domain"])
1564             instances.set_total_blocks(row["domain"], blocking)
1565
1566         obfuscated = 0
1567         blockdict = list()
1568
1569         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1570         for block in blocking:
1571             logger.debug("block[blocked]='%s'", block["blocked"])
1572             blocked = None
1573
1574             if block["blocked"] == "":
1575                 logger.debug("block[blocked] is empty - SKIPPED!")
1576                 continue
1577             elif block["blocked"].endswith(".arpa"):
1578                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1579                 continue
1580             elif block["blocked"].endswith(".tld"):
1581                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1582                 continue
1583             elif block["blocked"].endswith(".onion"):
1584                 logger.debug("blocked='%s' is a Tor onion domain name - SKIPPED!", block["blocked"])
1585                 continue
1586             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1587                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1588                 obfuscated = obfuscated + 1
1589                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1590             elif not domain_helper.is_wanted(block["blocked"]):
1591                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1592                 continue
1593             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1594                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1595                 continue
1596
1597             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1598             if blocked is not None and blocked != block["blocked"]:
1599                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1600                 obfuscated = obfuscated - 1
1601
1602                 if blocks.is_instance_blocked(row["domain"], blocked):
1603                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1604                     continue
1605                 elif blacklist.is_blacklisted(blocked):
1606                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1607                     continue
1608
1609                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1610
1611                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1612                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1613                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1614                     blockdict.append({
1615                         "blocked": blocked,
1616                         "reason" : block["reason"],
1617                     })
1618
1619         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1620         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1621
1622         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1623         if obfuscated == 0 and len(blocking) > 0:
1624             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1625             instances.set_has_obfuscation(row["domain"], False)
1626
1627         if instances.has_pending(row["domain"]):
1628             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1629             instances.update(row["domain"])
1630
1631         logger.debug("Invoking commit() ...")
1632         database.connection.commit()
1633
1634         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1635         if config.get("bot_enabled") and len(blockdict) > 0:
1636             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1637             network.send_bot_post(row["domain"], blockdict)
1638
1639     logger.debug("Success! - EXIT!")
1640     return 0
1641
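# Fetches a CSV of known instances from demo.fedilist.com, optionally filtered
# by software type, and registers every wanted domain that is not yet known.
# Presumably invoked as e.g. 'fetch_fedilist --software=mastodon' through the
# command-line wrapper that maps sub-commands onto these functions.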
1642 def fetch_fedilist(args: argparse.Namespace) -> int:
1643     logger.debug("args[]='%s' - CALLED!", type(args))
1644
1645     logger.debug("Invoking locking.acquire() ...")
1646     locking.acquire()
1647
1648     source_domain = "demo.fedilist.com"
1649     if sources.is_recent(source_domain):
1650         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1651         return 0
1652     else:
1653         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1654         sources.update(source_domain)
1655
1656     url = f"http://{source_domain}/instance/csv?onion=not"
1657     if args.software is not None and args.software != "":
1658         logger.debug("args.software='%s'", args.software)
1659         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1660
1661     logger.info("Fetching url='%s' ...", url)
1662     response = reqto.get(
1663         url,
1664         headers=network.web_headers,
1665         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1666         allow_redirects=False
1667     )
1668
1669     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1670     if not response.ok or response.status_code != 200 or len(response.content) == 0:
1671         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1672         return 1
1673
1674     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1675
1676     logger.debug("reader[]='%s'", type(reader))
1677     if reader is None:
1678         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1679         return 2
1680
1681     rows = list(reader)
1682
1683     logger.info("Checking %d rows ...", len(rows))
1684     for row in rows:
1685         logger.debug("row[]='%s'", type(row))
1686         if "hostname" not in row:
1687             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1688             continue
1689
1690         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1691         domain = tidyup.domain(row["hostname"])
1692         logger.debug("domain='%s' - AFTER!", domain)
1693
1694         if domain == "":
1695             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1696             continue
1697
1698         logger.debug("domain='%s' - BEFORE!", domain)
1699         domain = domain.encode("idna").decode("utf-8")
1700         logger.debug("domain='%s' - AFTER!", domain)
1701
1702         if not domain_helper.is_wanted(domain):
1703             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1704             continue
1705         elif not args.force and instances.is_registered(domain):
1706             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1707             continue
1708         elif instances.is_recent(domain):
1709             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1710             continue
1711
1712         logger.info("Fetching instances from domain='%s' ...", domain)
1713         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1714
1715     logger.debug("Success! - EXIT!")
1716     return 0
1717
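# Re-runs software detection (nodeinfo) for stored instances, narrowed down by
# the domain, software, mode or no_software arguments, and records any change
# in the detected software type.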
1718 def update_nodeinfo(args: argparse.Namespace) -> int:
1719     logger.debug("args[]='%s' - CALLED!", type(args))
1720
1721     logger.debug("Invoking locking.acquire() ...")
1722     locking.acquire()
1723
1724     if args.domain is not None and args.domain != "":
1725         logger.debug("Fetching args.domain='%s'", args.domain)
1726         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1727     elif args.software is not None and args.software != "":
1728         logger.info("Fetching domains for args.software='%s'", args.software)
1729         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1730     elif args.mode is not None and args.mode != "":
1731         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1732         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1733     elif args.no_software:
1734         logger.info("Fetching domains with no software type detected ...")
1735         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
1736     else:
1737         logger.info("Fetching domains for recently updated ...")
1738         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1739
1740     domains = database.cursor.fetchall()
1741
1742     logger.info("Checking %d domain(s) ...", len(domains))
1743     cnt = 0
1744     for row in domains:
1745         logger.debug("row[]='%s'", type(row))
1746         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1747             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1748             continue
1749
1750         try:
1751             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1752             software = federation.determine_software(row["domain"])
1753
1754             logger.debug("Determined software='%s'", software)
1755             if (software != row["software"] and software is not None) or args.force is True:
1756                 logger.debug("software='%s'", software)
1757                 if software is None:
1758                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1759                     instances.set_nodeinfo_url(row["domain"], None)
1760
1761                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1762                 instances.set_software(row["domain"], software)
1763
1764             if software is not None:
1765                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1766                 instances.set_success(row["domain"])
1767         except network.exceptions as exception:
1768             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1769             instances.set_last_error(row["domain"], exception)
1770
1771         instances.set_last_nodeinfo(row["domain"])
1772         instances.update(row["domain"])
1773         cnt = cnt + 1
1774
1775     logger.debug("Success! - EXIT!")
1776     return 0
1777
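# Queries the instances.social list API (requires instances_social_api_key to
# be set in the configuration) and registers every new, wanted domain found.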
1778 def fetch_instances_social(args: argparse.Namespace) -> int:
1779     logger.debug("args[]='%s' - CALLED!", type(args))
1780
1781     logger.debug("Invoking locking.acquire() ...")
1782     locking.acquire()
1783
1784     source_domain = "instances.social"
1785
1786     if config.get("instances_social_api_key") == "":
1787         logger.error("API key not set. Please set it in your config.json file.")
1788         return 1
1789     elif sources.is_recent(source_domain):
1790         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1791         return 0
1792     else:
1793         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1794         sources.update(source_domain)
1795
1796     headers = {
1797         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1798     }
1799
1800     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1801     fetched = network.get_json_api(
1802         source_domain,
1803         "/api/1.0/instances/list?count=0&sort_by=name",
1804         headers,
1805         (config.get("connection_timeout"), config.get("read_timeout"))
1806     )
1807     logger.debug("fetched[]='%s'", type(fetched))
1808
1809     if "error_message" in fetched:
1810         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1811         return 2
1812     elif "exception" in fetched:
1813         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1814         return 3
1815     elif "json" not in fetched:
1816         logger.warning("fetched has no element 'json' - EXIT!")
1817         return 4
1818     elif "instances" not in fetched["json"]:
1819         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1820         return 5
1821
1822     domains = list()
1823     rows = fetched["json"]["instances"]
1824
1825     logger.info("Checking %d row(s) ...", len(rows))
1826     for row in rows:
1827         logger.debug("row[]='%s'", type(row))
1828         domain = tidyup.domain(row["name"])
1829         logger.debug("domain='%s' - AFTER!", domain)
1830
1831         if domain == "":
1832             logger.debug("domain is empty - SKIPPED!")
1833             continue
1834
1835         logger.debug("domain='%s' - BEFORE!", domain)
1836         domain = domain.encode("idna").decode("utf-8")
1837         logger.debug("domain='%s' - AFTER!", domain)
1838
1839         if not domain_helper.is_wanted(domain):
1840             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1841             continue
1842         elif domain in domains:
1843             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1844             continue
1845         elif instances.is_registered(domain):
1846             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1847             continue
1848         elif instances.is_recent(domain):
1849             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1850             continue
1851
1852         logger.info("Fetching instances from domain='%s'", domain)
1853         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1854
1855     logger.debug("Success! - EXIT!")
1856     return 0
1857
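# Fetches the front page of each known relay (activityrelay, aoderelay or
# selective-relay), scrapes the peer list in the format that software renders
# and registers newly found peers.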
1858 def fetch_relays(args: argparse.Namespace) -> int:
1859     logger.debug("args[]='%s' - CALLED!", type(args))
1860
1861     logger.debug("Invoking locking.acquire() ...")
1862     locking.acquire()
1863
1864     if args.domain is not None and args.domain != "":
1865         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1866     else:
1867         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1868
1869     domains = list()
1870     rows = database.cursor.fetchall()
1871
1872     logger.info("Checking %d relays ...", len(rows))
1873     for row in rows:
1874         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1875         peers = list()
1876         if not args.force and instances.is_recent(row["domain"]):
1877             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1878             continue
1879
1880         try:
1881             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1882             raw = utils.fetch_url(
1883                 f"https://{row['domain']}",
1884                 network.web_headers,
1885                 (config.get("connection_timeout"), config.get("read_timeout"))
1886             ).text
1887             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1888         except network.exceptions as exception:
1889             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1890             instances.set_last_error(row["domain"], exception)
1891             instances.set_last_instance_fetch(row["domain"])
1892             instances.update(row["domain"])
1893             continue
1894
1895         doc = bs4.BeautifulSoup(raw, features="html.parser")
1896         logger.debug("doc[]='%s'", type(doc))
1897
1898         logger.debug("row[software]='%s'", row["software"])
1899         if row["software"] == "activityrelay":
1900             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1901             tags = doc.findAll("p")
1902
1903             logger.debug("Checking %d paragraphs ...", len(tags))
1904             for tag in tags:
1905                 logger.debug("tag[]='%s'", type(tag))
1906                 if len(tag.contents) == 0:
1907                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1908                     continue
1909                 elif "registered instances" not in tag.contents[0]:
1910                     logger.debug("Skipping paragraph, text not found.")
1911                     continue
1912
1913                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1914                 for domain in tag.contents:
1915                     logger.debug("domain[%s]='%s'", type(domain), domain)
1916                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1917                         continue
1918
1919                     domain = str(domain)
1920                     logger.debug("domain='%s'", domain)
1921                     if not domain_helper.is_wanted(domain):
1922                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1923                         continue
1924
1925                     logger.debug("domain='%s' - BEFORE!", domain)
1926                     domain = tidyup.domain(domain)
1927                     logger.debug("domain='%s' - AFTER!", domain)
1928
1929                     if domain == "":
1930                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1931                         continue
1932                     elif domain not in peers:
1933                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1934                         peers.append(domain)
1935
1936                     if dict_helper.has_key(domains, "domain", domain):
1937                         logger.debug("domain='%s' already added", domain)
1938                         continue
1939
1940                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1941                     domains.append({
1942                         "domain": domain,
1943                         "origin": row["domain"],
1944                     })
1945         elif row["software"] in ["aoderelay", "selective-relay"]:
1946             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1947             if row["software"] == "aoderelay":
1948                 tags = doc.findAll("section", {"class": "instance"})
1949             else:
1950                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1951
1952             logger.debug("Checking %d tags ...", len(tags))
1953             for tag in tags:
1954                 logger.debug("tag[]='%s'", type(tag))
1955
1956                 link = tag.find("a")
1957                 logger.debug("link[%s]='%s'", type(link), link)
1958                 if link is None:
1959                     logger.warning("tag='%s' has no a-tag ...", tag)
1960                     continue
1961
1962                 components = urlparse(link["href"])
1963                 domain = components.netloc.lower()
1964
1965                 if not domain_helper.is_wanted(domain):
1966                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1967                     continue
1968
1969                 logger.debug("domain='%s' - BEFORE!", domain)
1970                 domain = tidyup.domain(domain)
1971                 logger.debug("domain='%s' - AFTER!", domain)
1972
1973                 if domain == "":
1974                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1975                     continue
1976                 elif domain not in peers:
1977                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1978                     peers.append(domain)
1979
1980                 if dict_helper.has_key(domains, "domain", domain):
1981                     logger.debug("domain='%s' already added", domain)
1982                     continue
1983
1984                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1985                 domains.append({
1986                     "domain": domain,
1987                     "origin": row["domain"],
1988                 })
1989         else:
1990             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1991
1992         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1993         instances.set_last_instance_fetch(row["domain"])
1994
1995         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1996         instances.set_total_peers(row["domain"], peers)
1997
1998         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1999         instances.update(row["domain"])
2000
2001     logger.info("Checking %d domains ...", len(domains))
2002     for row in domains:
2003         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2004         if instances.is_registered(row["domain"]):
2005             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2006             continue
2007
2008         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2009         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2010
2011     logger.debug("Success! - EXIT!")
2012     return 0
2013
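# Selects all domain columns that do not yet contain punycode ("xn--") and
# hands them to translate_idnas(), which presumably rewrites them into their
# IDNA representation.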
2014 def convert_idna(args: argparse.Namespace) -> int:
2015     logger.debug("args[]='%s' - CALLED!", type(args))
2016
2017     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2018     rows = database.cursor.fetchall()
2019
2020     logger.debug("rows[]='%s'", type(rows))
2021     instances.translate_idnas(rows, "domain")
2022
2023     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2024     rows = database.cursor.fetchall()
2025
2026     logger.debug("rows[]='%s'", type(rows))
2027     instances.translate_idnas(rows, "origin")
2028
2029     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2030     rows = database.cursor.fetchall()
2031
2032     logger.debug("rows[]='%s'", type(rows))
2033     blocks.translate_idnas(rows, "blocker")
2034
2035     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2036     rows = database.cursor.fetchall()
2037
2038     logger.debug("rows[]='%s'", type(rows))
2039     blocks.translate_idnas(rows, "blocked")
2040
2041     logger.debug("Success! - EXIT!")
2042     return 0
2043
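# Deletes instances whose domain name does not validate, along with their
# block records, then vacuums the database.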
2044 def remove_invalid(args: argparse.Namespace) -> int:
2045     logger.debug("args[]='%s' - CALLED!", type(args))
2046
2047     logger.debug("Invoking locking.acquire() ...")
2048     locking.acquire()
2049
2050     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2051     rows = database.cursor.fetchall()
2052
2053     logger.info("Checking %d domains ...", len(rows))
2054     for row in rows:
2055         logger.debug("row[domain]='%s'", row["domain"])
2056         if not validators.domain(row["domain"].split("/")[0]):
2057             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2058             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2059             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2060
2061     logger.debug("Invoking commit() ...")
2062     database.connection.commit()
2063
2064     logger.info("Vacuum cleaning database ...")
2065     database.cursor.execute("VACUUM")
2066
2067     logger.debug("Success! - EXIT!")
2068     return 0