# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from urllib.parse import urlparse

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

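# Hypothetical invocation sketch (assumption: a CLI front-end such as fba.py
# dispatches sub-commands to these handlers; that wiring is not part of this
# file):
#
#   ./fba.py check_instance --domain=example.com
#
# Exit codes per the logic above: 0 = not yet known, 100 = invalid domain,
# 101 = blacklisted, 102 = already registered.
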
def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
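        # Compare against the IDNA ("punycode") form as well; e.g. "börse.de"
        # encodes to "xn--brse-5qa.de" (illustrative example).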
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF headers by default; there is no need to add network.source_headers manually here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

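        # Illustrative response shape (an assumption inferred from the checks
        # below, not from documented API behaviour):
        #   {"json": {"data": [{"domain": "pixelfed.social", ...}, ...]}}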
        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.sources.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

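        # Illustrative response shape (an assumption derived from the checks
        # below): {"json": {"data": {"nodeinfo": [{"domain": "example.com"}]}}}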
        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some Friendica servers also obfuscate domains without a hash
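                # Illustrative obfuscated entry (hypothetical values):
                #   {"blocked": "exam*le.org", "hash": "<hash of the full domain>"}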
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some servers obfuscate with question marks instead; it is unclear whether this depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = utils.alias_block_level(block["block_level"])

            if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
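            # Illustrative request (values assumed): the endpoint
            # https://fediverse.observer/app/views/tabledata.php?software=mastodon
            # appears to return an HTML table of instance links.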
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    domains = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    domains["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    domains["reject"] = federation.find_domains(blocked)

    blocking = domains["silenced"] + domains["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"]))
    blockdict = list()
    if len(domains["silenced"]) > 0 or len(domains["reject"]) > 0:
        for block_level in domains:
            logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level]))

            for row in domains[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
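            # item.link is assumed to look like "...?domain=example.com"
            # (illustrative), hence taking the part after the first "=".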
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryana.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
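                # A single href may carry several comma-separated domains,
                # e.g. href="example.com,example.org" (illustrative), hence the split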
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue
                    elif not utils.is_domain_wanted(domain):
                        logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(row["domain"]):
            logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or len(response.content) == 0:
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        # Materialize the reader so the row count can be logged (csv.DictReader has no len())
        rows = list(reader)
        logger.info("Processing %d rows ...", len(rows))
        cnt = 0
        for row in rows:
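            # Illustrative row, assuming Mastodon-style blocklist CSV columns:
            #   {"#domain": "example.com", "#severity": "suspend", ...}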
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = row["#severity"]
            elif "severity" in row:
                severity = row["severity"]
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='reject' for blocker='%s' ...", domain, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                utils.process_block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                utils.process_block(block["blocker"], domain, None, "reject_reports")

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", block["blocker"], cnt)
        instances.set_total_blocks(block["blocker"], cnt)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

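    # The referenced text file is assumed to contain one blocked domain per
    # line (see the split("\n") handling below).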
    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain)

                logger.debug("domain='%s' - AFTER!", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    response = utils.fetch_url(
        f"https://{source_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
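        # Each signatory is assumed to appear as a list item whose first text
        # node is the instance domain (hence row.contents[0] below).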
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

1172 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1173     logger.debug("args[]='%s' - CALLED!", type(args))
1174
1175     logger.debug("Invoking locking.acquire() ...")
1176     locking.acquire()
1177
1178     source_domain = "joinfediverse.wiki"
1179     if sources.is_recent(source_domain):
1180         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1181         return 0
1182     else:
1183         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1184         sources.update(source_domain)
1185
1186     raw = utils.fetch_url(
1187         f"https://{source_domain}/FediBlock",
1188         network.web_headers,
1189         (config.get("connection_timeout"), config.get("read_timeout"))
1190     ).text
1191     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1192
1193     doc = bs4.BeautifulSoup(raw, "html.parser")
1194     logger.debug("doc[]='%s'", type(doc))
1195
1196     tables = doc.find_all("table", {"class": "wikitable"})
1197
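         # The FediBlock page keeps its block lists in "wikitable" tables: header rows name
         # the columns, data rows carry the entries. Map the headers first, then scrape rows.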
1198     logger.info("Analyzing %d table(s) ...", len(tables))
1199     blocklist = list()
1200     for table in tables:
1201         logger.debug("table[]='%s'", type(table))
1202
1203         rows = table.find_all("tr")
1204         logger.info("Checking %d row(s) ...", len(rows))
1205         block_headers = dict()
1206         for row in rows:
1207             logger.debug("row[%s]='%s'", type(row), row)
1208
1209             headers = row.find_all("th")
1210             logger.debug("Found headers()=%d header(s)", len(headers))
1211             if len(headers) > 1:
1212                 block_headers = dict()
1213                 cnt = 0
1214                 for header in headers:
1215                     cnt = cnt + 1
1216                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1217                     text = header.contents[0]
1218
1219                     logger.debug("text[]='%s'", type(text))
1220                     if not isinstance(text, str):
1221                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1222                         continue
1223                     elif validators.domain(text.strip()):
1224                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1225                         continue
1226
1227                     text = tidyup.domain(text.strip())
1228                     logger.debug("text='%s'", text)
1229                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1230                         logger.debug("Found header: '%s'=%d", text, cnt)
1231                         block_headers[cnt] = text
1232
1233             elif len(block_headers) == 0:
1234                 logger.debug("row is not scrapable - SKIPPED!")
1235                 continue
1236             else:
1237                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1238                 cnt = 0
1239                 block = dict()
1240
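                     # Walk the row's cells in document order; cnt is 1-based so it matches
                     # the positions recorded in block_headers.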
1241                 for element in row.find_all(["th", "td"]):
1242                     cnt = cnt + 1
1243                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1244                     if cnt in block_headers:
1245                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1246
1247                         text = element.text.strip()
1248                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1249
1250                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
                             # "domain" and "instance" were remapped to "blocked" above; raw
                             # reason text falls through to the else branch and is tidied later.
1251                         if key == "blocked":
1252                             block[key] = text
1255                         elif key == "subdomain(s)":
1256                             block[key] = list()
1257                             if text != "":
1258                                 block[key] = text.split("/")
1259                         else:
1260                             logger.debug("key='%s'", key)
1261                             block[key] = text
1262
1263                 logger.debug("block()=%d ...", len(block))
1264                 if len(block) > 0:
1265                     logger.debug("Appending block()=%d ...", len(block))
1266                     blocklist.append(block)
1267
1268     logger.debug("blocklist()=%d", len(blocklist))
1269
1270     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1271     domains = database.cursor.fetchall()
1272
1273     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1274     blocking = list()
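         # Expand any "subdomain(s)" column into one blocking entry per subdomain.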
1275     for block in blocklist:
1276         logger.debug("block='%s'", block)
1277         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1278             origin = block["blocked"]
1279             for subdomain in block["subdomain(s)"]:
                     # Copy the dict; appending the same object for every subdomain would
                     # leave only the last subdomain in all of the appended entries.
                     entry = block.copy()
                     entry["blocked"] = subdomain + "." + origin
                     blocking.append(entry)
1282         else:
1283             blocking.append(block)
1284
1285     logger.debug("blocking()=%d", len(blocking))
1286     for block in blocking:
1287         logger.debug("block[]='%s'", type(block))
1288         block["blocked"] = tidyup.domain(block["blocked"])
1289
1290         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1291         if block["blocked"] == "":
1292             logger.debug("block[blocked] is empty - SKIPPED!")
1293             continue
1294         elif not utils.is_domain_wanted(block["blocked"]):
1295             logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1296             continue
1297         elif instances.is_recent(block["blocked"]):
1298             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
1299             continue
1300
1301         logger.info("Processing blocked='%s' ...", block["blocked"])
1302         utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1303
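         # Second pass: record every scraped block for each climatejustice.* blocker found above.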
1304     blockdict = list()
1305     for blocker in domains:
1306         blocker = blocker[0]
1307         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1308
1309         for block in blocking:
1310             logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block.get("reason"))
1311             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1312
1313             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1314             if block["blocked"] == "":
1315                 logger.debug("block[blocked] is empty - SKIPPED!")
1316                 continue
1317             elif not utils.is_domain_wanted(block["blocked"]):
1318                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1319                 continue
1320
1321             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1322             if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1323                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1324                 blockdict.append({
1325                     "blocked": block["blocked"],
1326                     "reason" : block["reason"],
1327                 })
1328
1329         if instances.has_pending(blocker):
1330             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1331             instances.update_data(blocker)
1332
1333         logger.debug("Invoking commit() ...")
1334         database.connection.commit()
1335
1336         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1337         if config.get("bot_enabled") and len(blockdict) > 0:
1338             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1339             network.send_bot_post(blocker, blockdict)
1340
1341     logger.debug("Success! - EXIT!")
1342     return 0
1343
1344 def recheck_obfuscation(args: argparse.Namespace) -> int:
1345     logger.debug("args[]='%s' - CALLED!", type(args))
1346
1347     logger.debug("Invoking locking.acquire() ...")
1348     locking.acquire()
1349
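         # Narrow the candidate set: a single domain, every instance running the given
         # software, or all instances flagged as having obfuscated block lists.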
1350     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1351         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1352     elif isinstance(args.software, str) and args.software != "":
1353         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1354     else:
1355         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1356
1357     rows = database.cursor.fetchall()
1358     logger.info("Checking %d domains ...", len(rows))
1359     for row in rows:
1360         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1361         if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
1362             logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1363             continue
1364
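             # Each software family exposes its block list differently, so dispatch on the
             # stored software type.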
1365         blocking = list()
1366         if row["software"] == "pleroma":
1367             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1368             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1369         elif row["software"] == "mastodon":
1370             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1371             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1372         elif row["software"] == "lemmy":
1373             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1374             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1375         elif row["software"] == "friendica":
1376             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1377             blocking = friendica.fetch_blocks(row["domain"])
1378         elif row["software"] == "misskey":
1379             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1380             blocking = misskey.fetch_blocks(row["domain"])
1381         else:
1382             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1383
1384         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1385         instances.set_total_blocks(row["domain"], blocking)
1386
1387         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1388         obfuscated = 0
1389         blockdict = list()
1390         for block in blocking:
1391             logger.debug("block[blocked]='%s'", block["blocked"])
1392             blocked = None
1393
1394             if block["blocked"] == "":
1395                 logger.debug("block[blocked] is empty - SKIPPED!")
1396                 continue
1397             elif block["blocked"].endswith(".arpa"):
1398                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1399                 continue
1400             elif block["blocked"].endswith(".tld"):
1401                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1402                 continue
1403             elif block["blocked"].endswith(".onion"):
1404                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1405                 continue
1406             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1407                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1408                 obfuscated = obfuscated + 1
1409                 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1410             elif not utils.is_domain_wanted(block["blocked"]):
1411                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1412                 continue
1413             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1414                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1415                 continue
1416
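                 # A successful deobfuscation yields the real domain name; record the block
                 # under that name instead of the wildcard pattern.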
1417             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1418             if blocked is not None and blocked != block["blocked"]:
1419                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1420                 obfuscated = obfuscated - 1
1421                 if blocks.is_instance_blocked(row["domain"], blocked):
1422                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1423                     continue
1424
1425                 block["block_level"] = utils.alias_block_level(block["block_level"])
1426
1427                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1428                 if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1429                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1430                     blockdict.append({
1431                         "blocked": blocked,
1432                         "reason" : block["reason"],
1433                     })
1434
1435         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1436         if obfuscated == 0 and len(blocking) > 0:
1437             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1438             instances.set_has_obfuscation(row["domain"], False)
1439
1440         if instances.has_pending(row["domain"]):
1441             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1442             instances.update_data(row["domain"])
1443
1444         logger.debug("Invoking commit() ...")
1445         database.connection.commit()
1446
1447         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1448         if config.get("bot_enabled") and len(blockdict) > 0:
1449             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1450             network.send_bot_post(row["domain"], blockdict)
1451
1452     logger.debug("Success! - EXIT!")
1453     return 0
1454
1455 def fetch_fedilist(args: argparse.Namespace) -> int:
1456     logger.debug("args[]='%s' - CALLED!", type(args))
1457
1458     logger.debug("Invoking locking.acquire() ...")
1459     locking.acquire()
1460
1461     source_domain = "demo.fedilist.com"
1462     if sources.is_recent(source_domain):
1463         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1464         return 0
1465     else:
1466         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1467         sources.update(source_domain)
1468
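         # fedilist exports its instance list as CSV; an optional software filter is passed
         # through as a query parameter (note the plain-HTTP demo host).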
1469     url = f"http://{source_domain}/instance/csv?onion=not"
1470     if args.software is not None and args.software != "":
1471         logger.debug("args.software='%s'", args.software)
1472         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1473
1474     logger.info("Fetching url='%s' ...", url)
1475     response = reqto.get(
1476         url,
1477         headers=network.web_headers,
1478         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1479         allow_redirects=False
1480     )
1481
1482     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1483     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1484         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1485         return 1
1486
1487     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1488
1489     logger.debug("reader[]='%s'", type(reader))
1490     blockdict = list()
1491     for row in reader:
1492         logger.debug("row[]='%s'", type(row))
1493         domain = tidyup.domain(row["hostname"])
1494         logger.debug("domain='%s' - AFTER!", domain)
1495
1496         if domain == "":
1497             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1498             continue
1499         elif not utils.is_domain_wanted(domain):
1500             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1501             continue
1502         elif (args.all is None or not args.all) and instances.is_registered(domain):
1503             logger.debug("domain='%s' is already registered and --all not specified: args.all[]='%s' - SKIPPED!", domain, type(args.all))
1504             continue
1505         elif instances.is_recent(domain):
1506             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1507             continue
1508
1509         logger.info("Fetching instances from domain='%s' ...", domain)
1510         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1511
1512     logger.debug("Success! - EXIT!")
1513     return 0
1514
1515 def update_nodeinfo(args: argparse.Namespace) -> int:
1516     logger.debug("args[]='%s' - CALLED!", type(args))
1517
1518     logger.debug("Invoking locking.acquire() ...")
1519     locking.acquire()
1520
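         # Choose what to re-check: a single domain, all instances of one software, or
         # everything whose nodeinfo is stale per the "recheck_nodeinfo" interval.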
1521     if args.domain is not None and args.domain != "":
1522         logger.debug("Fetching args.domain='%s'", args.domain)
1523         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1524     elif args.software is not None and args.software != "":
1525         logger.info("Fetching domains for args.software='%s'", args.software)
1526         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1527     else:
1528         logger.info("Fetching domains for recently updated ...")
1529         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1530
1531     domains = database.cursor.fetchall()
1532
1533     logger.info("Checking %d domain(s) ...", len(domains))
1534     cnt = 0
1535     for row in domains:
1536         logger.debug("row[]='%s'", type(row))
1537         try:
1538             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1539             software = federation.determine_software(row["domain"])
1540
1541             logger.debug("Determined software='%s'", software)
1542             if software is not None and software != row["software"]:
1543                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1544                 instances.set_software(row["domain"], software)
1545
1546             instances.set_success(row["domain"])
1547         except network.exceptions as exception:
1548             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1549             instances.set_last_error(row["domain"], exception)
1550
1551         instances.set_last_nodeinfo(row["domain"])
1552         instances.update_data(row["domain"])
1553         cnt = cnt + 1
1554
1555     logger.debug("Success! - EXIT!")
1556     return 0
1557
1558 def fetch_instances_social(args: argparse.Namespace) -> int:
1559     logger.debug("args[]='%s' - CALLED!", type(args))
1560
1561     logger.debug("Invoking locking.acquire() ...")
1562     locking.acquire()
1563
1564     source_domain = "instances.social"
1565
1566     if config.get("instances_social_api_key") == "":
1567         logger.error("API key not set. Please set in your config.json file.")
1568         return 1
1569     elif sources.is_recent(source_domain):
1570         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1571         return 0
1572     else:
1573         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1574         sources.update(source_domain)
1575
1576     headers = {
1577         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1578     }
1579
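         # count=0 is assumed here to request the full, unpaginated instance list.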
1580     fetched = network.get_json_api(
1581         source_domain,
1582         "/api/1.0/instances/list?count=0&sort_by=name",
1583         headers,
1584         (config.get("connection_timeout"), config.get("read_timeout"))
1585     )
1586     logger.debug("fetched[]='%s'", type(fetched))
1587
1588     if "error_message" in fetched:
1589         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1590         return 2
1591     elif "exception" in fetched:
1592         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1593         return 3
1594     elif "json" not in fetched:
1595         logger.warning("fetched has no element 'json' - EXIT!")
1596         return 4
1597     elif "instances" not in fetched["json"]:
1598         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1599         return 5
1600
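         # "domains" tracks what this run has already queued so duplicate API rows are skipped.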
1601     domains = list()
1602     rows = fetched["json"]["instances"]
1603
1604     logger.info("Checking %d row(s) ...", len(rows))
1605     for row in rows:
1606         logger.debug("row[]='%s'", type(row))
1607         domain = tidyup.domain(row["name"])
1608
1609         logger.debug("domain='%s' - AFTER!", domain)
1610         if domain == "":
1611             logger.debug("domain is empty - SKIPPED!")
1612             continue
1613         elif not utils.is_domain_wanted(domain):
1614             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1615             continue
1616         elif domain in domains:
1617             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1618             continue
1619         elif instances.is_registered(domain):
1620             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1621             continue
1622         elif instances.is_recent(domain):
1623             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1624             continue
1625
             # Remember the domain so the duplicate check above catches repeated rows.
             domains.append(domain)
1626         logger.info("Fetching instances from domain='%s' ...", domain)
1627         federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
1628
1629     logger.debug("Success! - EXIT!")
1630     return 0