fba/commands.py

# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
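    """
    Checks whether args.domain is a valid, non-blacklisted and not yet
    registered domain. Returns 0 when the domain is still unknown,
    otherwise a status code between 100 and 102.
    """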
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
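    """
    Scans all registered instances that have a nodeinfo_url set and counts
    those whose URL contains neither the instance's domain nor its punycode
    form. Relative URLs always match and are skipped. Always returns 0.
    """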
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
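    """
    Fetches the server list from the pixelfed.org API and fetches instance
    data for every wanted, not yet registered domain. Returns 0 on success,
    otherwise a non-zero status code.
    """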
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is sent by default; network.source_headers does not need to be added here.
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
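    """
    Fetches a domain list from the gql.api.bka.li GraphQL API and fetches
    instance data for every new, wanted domain. Returns 0 on success.
    """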
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error message='%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
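    """
    Fetches and parses blocklists from registered instances, either for a
    single domain (args.domain), a single software (args.software), all
    supported instances (args.force) or those whose last check is older
    than the configured 'recheck_block' interval. Returns 0 on success.
    """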
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # chaos.social isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.info("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.info("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.info("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.info("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.info("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some servers obscure domains with question marks; unclear whether this depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Domain wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
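    """
    Crawls fediverse.observer per software type (or only args.software when
    given) and fetches instance data for newly found domains. Returns 0 on
    success.
    """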
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - BEFORE!", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
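    """
    Parses the silenced/limited and suspended server lists from wiki.todon.eu
    and records them as blocks for todon.eu. Returns 0 on success.
    """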
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
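    """
    Parses chaos.social's federation.md (rendered from Markdown) and records
    the silenced and blocked instances as blocks for chaos.social. Returns 0
    on success.
    """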
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
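    """
    Fetches an FBA-specific RSS feed (args.feed) and fetches instance data
    for newly found domains; each item's link appears to carry the domain
    as a query parameter. Returns 0 on success.
    """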
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
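    """
    Fetches the FBA bot's ATOM feed (defaults to ryona.agency, overridable
    via args.feed) and fetches instance data for domains found in the
    entries' links. Returns 0 on success.
    """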
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
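    """
    Fetches instance data for args.domain and then, unless --single is
    given, re-crawls known instances whose last fetch is older than the
    configured 'recheck_instance' interval. Returns 0 on success.
    """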
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
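    """
    Downloads oliphant's CSV blocklists from codeberg.org and records the
    listed blocks, including reject_media/reject_reports levels. Returns 0
    on success.
    """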
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code > 200 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.instance(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.has(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
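    """
    Fetches static plain-text blocklists (currently only seirdy.one's
    bsl.txt) and processes each listed domain. Returns 0 on success.
    """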
1135     logger.debug("args[]='%s' - CALLED!", type(args))
1136
1137     logger.debug("Invoking locking.acquire() ...")
1138     locking.acquire()
1139
1140     # Static URLs
1141     urls = ({
1142         "blocker": "seirdy.one",
1143         "url"    : "https://seirdy.one/pb/bsl.txt",
1144     },)
1145
1146     logger.info("Checking %d text file(s) ...", len(urls))
1147     for row in urls:
1148         logger.debug("Fetching row[url]='%s' ...", row["url"])
1149         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1150
1151         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1152         if response.ok and response.status_code == 200 and response.text != "":
1153             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1154             domains = response.text.strip().split("\n")
1155
1156             logger.info("Processing %d domains ...", len(domains))
1157             for domain in domains:
1158                 logger.debug("domain='%s' - BEFORE!", domain)
1159                 domain = tidyup.domain(domain) if domain is not None and domain != "" else None
1160
1161                 logger.debug("domain='%s' - AFTER!", domain)
1162                 if domain is None or domain == "":
1163                     logger.debug("domain='%s' is empty - SKIPPED!", domain)
1164                     continue
1165                 elif not domain_helper.is_wanted(domain):
1166                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1167                     continue
1168                 elif instances.is_recent(domain):
1169                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1170                     continue
1171
1172                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1173                 processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1174
1175                 logger.debug("processed='%s'", processed)
1176                 if not processed:
1177                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1178                     continue
1179
1180     logger.debug("Success! - EXIT!")
1181     return 0
1182
1183 def fetch_fedipact(args: argparse.Namespace) -> int:
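    """
    Scrape the instance list from fedipact.online and fetch instance data
    for every wanted domain that is not yet registered or recently crawled.
    """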
1184     logger.debug("args[]='%s' - CALLED!", type(args))
1185
1186     logger.debug("Invoking locking.acquire() ...")
1187     locking.acquire()
1188
1189     source_domain = "fedipact.online"
1190     if sources.is_recent(source_domain):
1191         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1192         return 0
1193     else:
1194         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1195         sources.update(source_domain)
1196
1197     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1198     response = utils.fetch_url(
1199         f"https://{source_domain}",
1200         network.web_headers,
1201         (config.get("connection_timeout"), config.get("read_timeout"))
1202     )
1203
1204     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1205     if response.ok and response.status_code == 200 and response.text != "":
1206         logger.debug("Parsing %d Bytes ...", len(response.text))
1207
1208         doc = bs4.BeautifulSoup(response.text, "html.parser")
1209         logger.debug("doc[]='%s'", type(doc))
1210
1211         rows = doc.findAll("li")
1212         logger.info("Checking %d row(s) ...", len(rows))
1213         for row in rows:
1214             logger.debug("row[]='%s'", type(row))
1215             domain = tidyup.domain(row.contents[0])
1216
1217             logger.debug("domain='%s' - AFTER!", domain)
1218             if domain == "":
1219                 logger.debug("domain is empty - SKIPPED!")
1220                 continue
1221
1222             logger.debug("domain='%s' - BEFORE!", domain)
1223             domain = domain.encode("idna").decode("utf-8")
1224             logger.debug("domain='%s' - AFTER!", domain)
1225
1226             if not domain_helper.is_wanted(domain):
1227                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1228                 continue
1229             elif instances.is_registered(domain):
1230                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1231                 continue
1232             elif instances.is_recent(domain):
1233                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1234                 continue
1235
1236             logger.info("Fetching domain='%s' ...", domain)
1237             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1238
1239     logger.debug("Success! - EXIT!")
1240     return 0
1241
1242 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
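    """
    Fetch the public instance list from instances.joinmobilizon.org and
    register all wanted hosts that are not known yet.
    """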
1243     logger.debug("args[]='%s' - CALLED!", type(args))
1244
1245     logger.debug("Invoking locking.acquire() ...")
1246     locking.acquire()
1247
1248     source_domain = "instances.joinmobilizon.org"
1249     if sources.is_recent(source_domain):
1250         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1251         return 0
1252     else:
1253         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1254         sources.update(source_domain)
1255
1256     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1257     raw = utils.fetch_url(
1258         f"https://{source_domain}/api/v1/instances",
1259         network.web_headers,
1260         (config.get("connection_timeout"), config.get("read_timeout"))
1261     ).text
1262     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1263
1264     parsed = json.loads(raw)
1265     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1266
1267     if "data" not in parsed:
1268         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1269         return 1
1270
1271     logger.info("Checking %d instances ...", len(parsed["data"]))
1272     for row in parsed["data"]:
1273         logger.debug("row[]='%s'", type(row))
1274         if "host" not in row:
1275             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1276             continue
1277         elif not domain_helper.is_wanted(row["host"]):
1278             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1279             continue
1280         elif instances.is_registered(row["host"]):
1281             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1282             continue
1283
1284         logger.info("Fetching row[host]='%s' ...", row["host"])
1285         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1286
1287     logger.debug("Success! - EXIT!")
1288     return 0
1289
1290 def fetch_joinmisskey(args: argparse.Namespace) -> int:
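    """
    Fetch instances.json from instanceapp.misskey.page and register all
    wanted Misskey instances that are not known yet.
    """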
1291     logger.debug("args[]='%s' - CALLED!", type(args))
1292
1293     logger.debug("Invoking locking.acquire() ...")
1294     locking.acquire()
1295
1296     source_domain = "instanceapp.misskey.page"
1297     if sources.is_recent(source_domain):
1298         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1299         return 0
1300     else:
1301         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1302         sources.update(source_domain)
1303
1304     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1305     raw = utils.fetch_url(
1306         f"https://{source_domain}/instances.json",
1307         network.web_headers,
1308         (config.get("connection_timeout"), config.get("read_timeout"))
1309     ).text
1310     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1311
1312     parsed = json.loads(raw)
1313     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1314
1315     if "instancesInfos" not in parsed:
1316         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1317         return 1
1318
1319     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1320     for row in parsed["instancesInfos"]:
1321         logger.debug("row[%s]='%s'", type(row), row)
1322         if "url" not in row:
1323             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1324             continue
1325         elif not domain_helper.is_wanted(row["url"]):
1326             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1327             continue
1328         elif instances.is_registered(row["url"]):
1329             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1330             continue
1331
1332         logger.info("Fetching row[url]='%s' ...", row["url"])
1333         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1334
1335     logger.debug("Success! - EXIT!")
1336     return 0
1337
1338 def fetch_joinfediverse(args: argparse.Namespace) -> int:
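    """
    Scrape the FediBlock wiki page on joinfediverse.wiki, parse its block
    tables (expanding subdomain entries) and record the resulting blocks
    for all registered climatejustice.* instances.
    """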
1339     logger.debug("args[]='%s' - CALLED!", type(args))
1340
1341     logger.debug("Invoking locking.acquire() ...")
1342     locking.acquire()
1343
1344     source_domain = "joinfediverse.wiki"
1345     if sources.is_recent(source_domain):
1346         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1347         return 0
1348     else:
1349         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1350         sources.update(source_domain)
1351
1352     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1353     raw = utils.fetch_url(
1354         f"https://{source_domain}/FediBlock",
1355         network.web_headers,
1356         (config.get("connection_timeout"), config.get("read_timeout"))
1357     ).text
1358     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1359
1360     doc = bs4.BeautifulSoup(raw, "html.parser")
1361     logger.debug("doc[]='%s'", type(doc))
1362
1363     tables = doc.findAll("table", {"class": "wikitable"})
1364
1365     logger.info("Analyzing %d table(s) ...", len(tables))
1366     blocklist = list()
1367     for table in tables:
1368         logger.debug("table[]='%s'", type(table))
1369
1370         rows = table.findAll("tr")
1371         logger.info("Checking %d row(s) ...", len(rows))
1372         block_headers = dict()
1373         for row in rows:
1374             logger.debug("row[%s]='%s'", type(row), row)
1375
1376             headers = row.findAll("th")
1377             logger.debug("Found headers()=%d header(s)", len(headers))
1378             if len(headers) > 1:
1379                 block_headers = dict()
1380                 cnt = 0
1381                 for header in headers:
1382                     cnt = cnt + 1
1383                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1384                     text = header.contents[0]
1385
1386                     logger.debug("text[]='%s'", type(text))
1387                     if not isinstance(text, str):
1388                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1389                         continue
1390                     elif validators.domain(text.strip()):
1391                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1392                         continue
1393
1394                     text = tidyup.domain(text.strip())
1395                     logger.debug("text='%s' - AFTER!", text)
1396                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1397                         logger.debug("Found header: '%s'=%d", text, cnt)
1398                         block_headers[cnt] = text
1399
1400             elif len(block_headers) == 0:
1401                 logger.debug("row is not scrapable - SKIPPED!")
1402                 continue
1403             else:
1404                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1405                 cnt = 0
1406                 block = dict()
1407
1408                 for element in row.find_all(["th", "td"]):
1409                     cnt = cnt + 1
1410                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1411                     if cnt in block_headers:
1412                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1413
1414                         text = element.text.strip()
1415                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1416
1417                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1418                         if key == "blocked":  # "domain"/"instance" headers were mapped to "blocked" above
1419                             block[key] = text
1420                         elif key == "block reason(s)":
1421                             block[key] = tidyup.reason(text)
1422                         elif key == "subdomain(s)":
1423                             block[key] = list()
1424                             if text != "":
1425                                 block[key] = text.split("/")
1426                         else:
1427                             logger.debug("key='%s'", key)
1428                             block[key] = text
1429
1430                 logger.debug("block()=%d ...", len(block))
1431                 if len(block) > 0:
1432                     logger.debug("Appending block()=%d ...", len(block))
1433                     blocklist.append(block)
1434
1435     logger.debug("blocklist()=%d", len(blocklist))
1436
1437     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1438     domains = database.cursor.fetchall()
1439
1440     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1441     blocking = list()
1442     for block in blocklist:
1443         logger.debug("block='%s'", block)
1444         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1445             origin = block["blocked"]
1446             logger.debug("origin='%s'", origin)
1447             for subdomain in block["subdomain(s)"]:
1448                 entry = {**block, "blocked": subdomain + "." + origin}  # copy, don't mutate the shared dict
1449                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1450                 blocking.append(entry)
1451         else:
1452             blocking.append(block)
1453
1454     logger.debug("blocking()=%d", len(blocking))
1455     for block in blocking:
1456         logger.debug("block[]='%s'", type(block))
1457         if "blocked" not in block:
1458             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1459
1460         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1461         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1462
1463         if block["blocked"] == "":
1464             logger.debug("block[blocked] is empty - SKIPPED!")
1465             continue
1466         elif not domain_helper.is_wanted(block["blocked"]):
1467             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1468             continue
1469         elif instances.is_recent(block["blocked"]):
1470             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1471             continue
1472
1473         logger.debug("Processing blocked='%s' ...", block["blocked"])
1474         processing.instance(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1475
1476     blockdict = list()
1477     for blocker in domains:
1478         blocker = blocker[0]
1479         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1480         instances.set_last_blocked(blocker)
1481
1482         for block in blocking:
1483             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1484             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1485
1486             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1487             if block["blocked"] == "":
1488                 logger.debug("block[blocked] is empty - SKIPPED!")
1489                 continue
1490             elif not domain_helper.is_wanted(block["blocked"]):
1491                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1492                 continue
1493
1494             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1495             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1496                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1497                 blockdict.append({
1498                     "blocked": block["blocked"],
1499                     "reason" : block["reason"],
1500                 })
1501
1502         if instances.has_pending(blocker):
1503             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1504             instances.update(blocker)
1505
1506         logger.debug("Invoking commit() ...")
1507         database.connection.commit()
1508
1509         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1510         if config.get("bot_enabled") and len(blockdict) > 0:
1511             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1512             network.send_bot_post(blocker, blockdict)
1513
1514     logger.debug("Success! - EXIT!")
1515     return 0
1516
1517 def recheck_obfuscation(args: argparse.Namespace) -> int:
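    """
    Re-fetch block lists from instances flagged with has_obfuscation=1,
    try to deobfuscate wildcard entries ('*' or '?') and record every
    block that could be resolved to a real domain.
    """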
1518     logger.debug("args[]='%s' - CALLED!", type(args))
1519
1520     logger.debug("Invoking locking.acquire() ...")
1521     locking.acquire()
1522
1523     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1524         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1525     elif isinstance(args.software, str) and args.software != "":  # software names are not domains, so no domain validation here
1526         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1527     else:
1528         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1529
1530     rows = database.cursor.fetchall()
1531     logger.info("Checking %d domains ...", len(rows))
1532     for row in rows:
1533         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1534         if not args.force and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1535             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1536             continue
1537
1538         logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1539         blocking = federation.fetch_blocks(row["domain"])
1540
1541         logger.debug("blocking()=%d", len(blocking))
1542         if len(blocking) == 0:
1543             if row["software"] == "pleroma":
1544                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1545                 blocking = pleroma.fetch_blocks(row["domain"])
1546             elif row["software"] == "mastodon":
1547                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1548                 blocking = mastodon.fetch_blocks(row["domain"])
1549             elif row["software"] == "lemmy":
1550                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1551                 blocking = lemmy.fetch_blocks(row["domain"])
1552             elif row["software"] == "friendica":
1553                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1554                 blocking = friendica.fetch_blocks(row["domain"])
1555             elif row["software"] == "misskey":
1556                 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1557                 blocking = misskey.fetch_blocks(row["domain"])
1558             else:
1559                 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1560
1561         # chaos.social ("c.s") isn't part of oliphant's "hidden" blocklists
1562         logger.debug("row[domain]='%s'", row["domain"])
1563         if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]):
1564             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1565             instances.set_last_blocked(row["domain"])
1566             instances.set_total_blocks(row["domain"], blocking)
1567
1568         obfuscated = 0
1569         blockdict = list()
1570
1571         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1572         for block in blocking:
1573             logger.debug("block[blocked]='%s'", block["blocked"])
1574             blocked = None
1575
1576             if block["blocked"] == "":
1577                 logger.debug("block[blocked] is empty - SKIPPED!")
1578                 continue
1579             elif block["blocked"].endswith(".arpa"):
1580                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1581                 continue
1582             elif block["blocked"].endswith(".tld"):
1583                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1584                 continue
1585             elif block["blocked"].endswith(".onion"):
1586                 logger.debug("blocked='%s' is a Tor onion domain name - SKIPPED!", block["blocked"])
1587                 continue
1588             elif "*" in block["blocked"] or "?" in block["blocked"]:
1589                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1590                 obfuscated = obfuscated + 1
1591                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block.get("hash"))
1592             elif not domain_helper.is_wanted(block["blocked"]):
1593                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1594                 continue
1595             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1596                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1597                 continue
1598
1599             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1600             if blocked is not None and blocked != block["blocked"]:
1601                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1602                 obfuscated = obfuscated - 1
1603
1604                 if blacklist.is_blacklisted(blocked):
1605                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1606                     continue
1607                 elif blacklist.is_blacklisted(row["domain"]):
1608                     logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1609                     continue
1610                 elif blocks.is_instance_blocked(row["domain"], blocked):
1611                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1612                     continue
1613
1614                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1615
1616                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1617                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1618                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1619                     blockdict.append({
1620                         "blocked": blocked,
1621                         "reason" : block["reason"],
1622                     })
1623
1624         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1625         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1626
1627         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1628         if obfuscated == 0 and len(blocking) > 0:
1629             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1630             instances.set_has_obfuscation(row["domain"], False)
1631
1632         if instances.has_pending(row["domain"]):
1633             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1634             instances.update(row["domain"])
1635
1636         logger.debug("Invoking commit() ...")
1637         database.connection.commit()
1638
1639         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1640         if config.get("bot_enabled") and len(blockdict) > 0:
1641             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1642             network.send_bot_post(row["domain"], blockdict)
1643
1644     logger.debug("Success! - EXIT!")
1645     return 0
1646
1647 def fetch_fedilist(args: argparse.Namespace) -> int:
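    """
    Download the instance list as CSV from demo.fedilist.com, optionally
    filtered by --software, and fetch instance data for every wanted
    domain found in it.
    """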
1648     logger.debug("args[]='%s' - CALLED!", type(args))
1649
1650     logger.debug("Invoking locking.acquire() ...")
1651     locking.acquire()
1652
1653     source_domain = "demo.fedilist.com"
1654     if sources.is_recent(source_domain):
1655         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1656         return 0
1657     else:
1658         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1659         sources.update(source_domain)
1660
1661     url = f"http://{source_domain}/instance/csv?onion=not"
1662     if args.software is not None and args.software != "":
1663         logger.debug("args.software='%s'", args.software)
1664         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1665
1666     logger.info("Fetching url='%s' ...", url)
1667     response = reqto.get(
1668         url,
1669         headers=network.web_headers,
1670         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1671         allow_redirects=False
1672     )
1673
1674     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1675     if not response.ok or response.status_code != 200 or len(response.content) == 0:
1676         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1677         return 1
1678
1679     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1680
1681     logger.debug("reader[]='%s'", type(reader))
1682     if reader is None:
1683         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1684         return 2
1685
1686     rows = list(reader)
1687
1688     logger.info("Checking %d rows ...", len(rows))
1689     for row in rows:
1690         logger.debug("row[]='%s'", type(row))
1691         if "hostname" not in row:
1692             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1693             continue
1694
1695         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1696         domain = tidyup.domain(row["hostname"])
1697         logger.debug("domain='%s' - AFTER!", domain)
1698
1699         if domain == "":
1700             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1701             continue
1702
1703         logger.debug("domain='%s' - BEFORE!", domain)
1704         domain = domain.encode("idna").decode("utf-8")
1705         logger.debug("domain='%s' - AFTER!", domain)
1706
1707         if not domain_helper.is_wanted(domain):
1708             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1709             continue
1710         elif not args.force and instances.is_registered(domain):
1711             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1712             continue
1713         elif instances.is_recent(domain):
1714             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1715             continue
1716
1717         logger.info("Fetching instances from domain='%s' ...", domain)
1718         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1719
1720     logger.debug("Success! - EXIT!")
1721     return 0
1722
1723 def update_nodeinfo(args: argparse.Namespace) -> int:
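    """
    Re-check nodeinfo for instances selected by --domain, --software,
    --mode or --no-software (by default: instances with outdated nodeinfo)
    and update the stored software type when it has changed.
    """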
1724     logger.debug("args[]='%s' - CALLED!", type(args))
1725
1726     logger.debug("Invoking locking.acquire() ...")
1727     locking.acquire()
1728
1729     if args.domain is not None and args.domain != "":
1730         logger.debug("Fetching args.domain='%s'", args.domain)
1731         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1732     elif args.software is not None and args.software != "":
1733         logger.info("Fetching domains for args.software='%s'", args.software)
1734         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1735     elif args.mode is not None and args.mode != "":
1736         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1737         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1738     elif args.no_software:
1739         logger.info("Fetching domains with no software type detected ...")
1740         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
1741     else:
1742         logger.info("Fetching domains with outdated nodeinfo ...")
1743         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1744
1745     domains = database.cursor.fetchall()
1746
1747     logger.info("Checking %d domain(s) ...", len(domains))
1748     cnt = 0
1749     for row in domains:
1750         logger.debug("row[]='%s'", type(row))
1751         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1752             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1753             continue
1754
1755         try:
1756             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1757             software = federation.determine_software(row["domain"])
1758
1759             logger.debug("Determined software='%s'", software)
1760             if (software != row["software"] and software is not None) or args.force is True:
1761                 logger.debug("software='%s'", software)
1762                 if software is None:
1763                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1764                     instances.set_nodeinfo_url(row["domain"], None)
1765
1766                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1767                 instances.set_software(row["domain"], software)
1768
1769             if software is not None:
1770                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1771                 instances.set_success(row["domain"])
1772         except network.exceptions as exception:
1773             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1774             instances.set_last_error(row["domain"], exception)
1775
1776         instances.set_last_nodeinfo(row["domain"])
1777         instances.update(row["domain"])
1778         cnt = cnt + 1
1779
1780     logger.debug("Success! - EXIT!")
1781     return 0
1782
1783 def fetch_instances_social(args: argparse.Namespace) -> int:
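    """
    Query the instances.social API (requires 'instances_social_api_key'
    in config.json) and register all wanted domains from the returned
    instance list.
    """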
1784     logger.debug("args[]='%s' - CALLED!", type(args))
1785
1786     logger.debug("Invoking locking.acquire() ...")
1787     locking.acquire()
1788
1789     source_domain = "instances.social"
1790
1791         logger.error("API key not set. Please set it in your config.json file.")
1792         logger.error("API key not set. Please set in your config.json file.")
1793         return 1
1794     elif sources.is_recent(source_domain):
1795         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1796         return 0
1797     else:
1798         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1799         sources.update(source_domain)
1800
1801     headers = {
1802         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1803     }
1804
1805     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1806     fetched = network.get_json_api(
1807         source_domain,
1808         "/api/1.0/instances/list?count=0&sort_by=name",
1809         headers,
1810         (config.get("connection_timeout"), config.get("read_timeout"))
1811     )
1812     logger.debug("fetched[]='%s'", type(fetched))
1813
1814     if "error_message" in fetched:
1815         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1816         return 2
1817     elif "exception" in fetched:
1818         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1819         return 3
1820     elif "json" not in fetched:
1821         logger.warning("fetched has no element 'json' - EXIT!")
1822         return 4
1823     elif "instances" not in fetched["json"]:
1824         logger.warning("fetched['json'] has no element 'instances' - EXIT!")
1825         return 5
1826
1827     domains = list()
1828     rows = fetched["json"]["instances"]
1829
1830     logger.info("Checking %d row(s) ...", len(rows))
1831     for row in rows:
1832         logger.debug("row[]='%s'", type(row))
1833         domain = tidyup.domain(row["name"])
1834         logger.debug("domain='%s' - AFTER!", domain)
1835
1836         if domain == "":
1837             logger.debug("domain is empty - SKIPPED!")
1838             continue
1839
1840         logger.debug("domain='%s' - BEFORE!", domain)
1841         domain = domain.encode("idna").decode("utf-8")
1842         logger.debug("domain='%s' - AFTER!", domain)
1843
1844         if not domain_helper.is_wanted(domain):
1845             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1846             continue
1847         elif domain in domains:
1848             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1849             continue
1850         elif instances.is_registered(domain):
1851             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1852             continue
1853         elif instances.is_recent(domain):
1854             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1855             continue
1856
1857         logger.info("Fetching instances from domain='%s'", domain)
1858         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1859
1860     logger.debug("Success! - EXIT!")
1861     return 0
1862
1863 def fetch_relays(args: argparse.Namespace) -> int:
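    """
    Fetch the landing pages of known relays (activityrelay, aoderelay,
    selective-relay), scrape their peer lists and register peers that
    are not known yet.
    """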
1864     logger.debug("args[]='%s' - CALLED!", type(args))
1865
1866     logger.debug("Invoking locking.acquire() ...")
1867     locking.acquire()
1868
1869     if args.domain is not None and args.domain != "":
1870         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1871     else:
1872         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1873
1874     domains = list()
1875     rows = database.cursor.fetchall()
1876
1877     logger.info("Checking %d relays ...", len(rows))
1878     for row in rows:
1879         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1880         peers = list()
1881         if not args.force and instances.is_recent(row["domain"]):
1882             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1883             continue
1884
1885         try:
1886             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1887             raw = utils.fetch_url(
1888                 f"https://{row['domain']}",
1889                 network.web_headers,
1890                 (config.get("connection_timeout"), config.get("read_timeout"))
1891             ).text
1892             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1893         except network.exceptions as exception:
1894             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1895             instances.set_last_error(row["domain"], exception)
1896             instances.set_last_instance_fetch(row["domain"])
1897             instances.update(row["domain"])
1898             continue
1899
1900         doc = bs4.BeautifulSoup(raw, features="html.parser")
1901         logger.debug("doc[]='%s'", type(doc))
1902
1903         logger.debug("row[software]='%s'", row["software"])
1904         if row["software"] == "activityrelay":
1905             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1906             tags = doc.findAll("p")
1907
1908             logger.debug("Checking %d paragraphs ...", len(tags))
1909             for tag in tags:
1910                 logger.debug("tag[]='%s'", type(tag))
1911                 if len(tag.contents) == 0:
1912                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1913                     continue
1914                 elif "registered instances" not in tag.contents[0]:
1915                     logger.debug("Skipping paragraph, text not found.")
1916                     continue
1917
1918                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1919                 for domain in tag.contents:
1920                     logger.debug("domain[%s]='%s'", type(domain), domain)
1921                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1922                         continue
1923
1924                     domain = str(domain)
1925                     logger.debug("domain='%s'", domain)
1926                     if not domain_helper.is_wanted(domain):
1927                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1928                         continue
1929
1930                     logger.debug("domain='%s' - BEFORE!", domain)
1931                     domain = tidyup.domain(domain)
1932                     logger.debug("domain='%s' - AFTER!", domain)
1933
1934                     if domain == "":
1935                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1936                         continue
1937                     elif domain not in peers:
1938                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1939                         peers.append(domain)
1940
1941                     if dict_helper.has_key(domains, "domain", domain):
1942                         logger.debug("domain='%s' already added", domain)
1943                         continue
1944
1945                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1946                     domains.append({
1947                         "domain": domain,
1948                         "origin": row["domain"],
1949                     })
1950         elif row["software"] in ["aoderelay", "selective-relay"]:
1951             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1952             if row["software"] == "aoderelay":
1953                 tags = doc.findAll("section", {"class": "instance"})
1954             else:
1955                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1956
1957             logger.debug("Checking %d tags ...", len(tags))
1958             for tag in tags:
1959                 logger.debug("tag[]='%s'", type(tag))
1960
1961                 link = tag.find("a")
1962                 logger.debug("link[%s]='%s'", type(link), link)
1963                 if link is None:
1964                     logger.warning("tag='%s' has no a-tag ...", tag)
1965                     continue
1966
1967                 components = urlparse(link["href"])
1968                 domain = components.netloc.lower()
1969
1970                 if not domain_helper.is_wanted(domain):
1971                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1972                     continue
1973
1974                 logger.debug("domain='%s' - BEFORE!", domain)
1975                 domain = tidyup.domain(domain)
1976                 logger.debug("domain='%s' - AFTER!", domain)
1977
1978                 if domain == "":
1979                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1980                     continue
1981                 elif domain not in peers:
1982                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1983                     peers.append(domain)
1984
1985                 if dict_helper.has_key(domains, "domain", domain):
1986                     logger.debug("domain='%s' already added", domain)
1987                     continue
1988
1989                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1990                 domains.append({
1991                     "domain": domain,
1992                     "origin": row["domain"],
1993                 })
1994         else:
1995             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1996
1997         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1998         instances.set_last_instance_fetch(row["domain"])
1999
2000         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
2001         instances.set_total_peers(row["domain"], peers)
2002
2003         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
2004         instances.update(row["domain"])
2005
2006     logger.info("Checking %d domains ...", len(domains))
2007     for row in domains:
2008         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2009         if instances.is_registered(row["domain"]):
2010             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2011             continue
2012
2013         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2014         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2015
2016     logger.debug("Success! - EXIT!")
2017     return 0
2018
2019 def convert_idna(args: argparse.Namespace) -> int:
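    """
    Convert all non-punycode values in the instances (domain, origin) and
    blocks (blocker, blocked) tables to their IDNA representation.
    """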
2020     logger.debug("args[]='%s' - CALLED!", type(args))
2021
2022     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2023     rows = database.cursor.fetchall()
2024
2025     logger.debug("rows[]='%s'", type(rows))
2026     instances.translate_idnas(rows, "domain")
2027
2028     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2029     rows = database.cursor.fetchall()
2030
2031     logger.debug("rows[]='%s'", type(rows))
2032     instances.translate_idnas(rows, "origin")
2033
2034     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2035     rows = database.cursor.fetchall()
2036
2037     logger.debug("rows[]='%s'", type(rows))
2038     blocks.translate_idnas(rows, "blocker")
2039
2040     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2041     rows = database.cursor.fetchall()
2042
2043     logger.debug("rows[]='%s'", type(rows))
2044     blocks.translate_idnas(rows, "blocked")
2045
2046     logger.debug("Success! - EXIT!")
2047     return 0
2048
2049 def remove_invalid(args: argparse.Namespace) -> int:
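    """
    Delete instances whose domain fails validation, including their block
    records, then VACUUM the database.
    """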
2050     logger.debug("args[]='%s' - CALLED!", type(args))
2051
2052     logger.debug("Invoking locking.acquire() ...")
2053     locking.acquire()
2054
2055     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2056     rows = database.cursor.fetchall()
2057
2058     logger.info("Checking %d domains ...", len(rows))
2059     for row in rows:
2060         logger.debug("row[domain]='%s'", row["domain"])
2061         if not validators.domain(row["domain"].split("/")[0]):
2062             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2063             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2064             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2065
2066     logger.debug("Invoking commit() ...")
2067     database.connection.commit()
2068
2069     logger.info("Vacuum cleaning database ...")
2070     database.cursor.execute("VACUUM")
2071
2072     logger.debug("Success! - EXIT!")
2073     return 0