# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
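    """Checks whether args.domain is a valid, non-blacklisted, not yet
    registered domain. Returns 0 when the domain is unknown, 100 when
    invalid, 101 when blacklisted and 102 when already registered."""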
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
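    """Walks over all instances with a known nodeinfo_url and counts those
    whose URL contains neither the instance's domain nor its punycode
    representation."""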
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
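    """Fetches the server list from the pixelfed.org API and fetches
    instance data for every domain that is wanted, not yet registered
    and not recently crawled."""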
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is needed by default; network.source_headers does not have to be added manually here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch Pixelfed API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
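    """Fetches a domain list from the gql.api.bka.li GraphQL API and adds
    every wanted, not yet registered and not recently crawled domain as a
    new instance."""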
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
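    """Fetches blocklists from registered instances of known software, or
    from a single domain/software when args.domain/args.software is given,
    and records every (blocker, blocked, reason, block_level) entry."""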
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        blockdict = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obfuscate domains without a hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obfuscate them with question marks; unclear whether that depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = utils.alias_block_level(block["block_level"])

            if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
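    """Scrapes fediverse.observer for instance domains, either for all
    software types listed there or only for args.software, and fetches
    instance data for every wanted, unregistered domain."""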
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
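    """Parses the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks by todon.eu."""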
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
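    """Fetches chaos.social's federation.md from raw.githubusercontent.com,
    renders the Markdown and records the silenced and blocked instance
    tables as blocks by chaos.social."""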
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    blockdict = list()
    if len(blocking) > 0:
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
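    """Fetches an FBA-specific RSS feed given via args.feed and adds every
    wanted, not yet registered domain found in the item links."""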
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
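    """Fetches the Atom feed of the FBA bot account on ryona.agency and
    adds every wanted, not yet registered domain linked in its entries."""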
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue
                    elif not utils.is_domain_wanted(domain):
                        logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
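    """Fetches instance data from args.domain and then, unless args.single
    is set, re-crawls all known instances of supported software whose data
    is older than the configured recheck interval."""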
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(row["domain"]):
            logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
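    """Downloads the CSV blocklists from the oliphant/blocklists repository
    on codeberg.org and records each listed domain as a 'reject' block
    (plus 'reject_media'/'reject_reports' where flagged) by its blocker."""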
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = row["#severity"]
            elif "severity" in row:
                severity = row["severity"]
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='reject' for blocker='%s' ...", domain, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                utils.process_block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                utils.process_block(block["blocker"], domain, None, "reject_reports")

        logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
        instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
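    """Downloads plain-text blocklists (currently seirdy.one's bsl.txt) and
    processes every listed domain for the corresponding blocker."""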
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain)

                logger.debug("domain='%s' - AFTER!", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
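    """Scrapes the instance list from fedipact.online and fetches instance
    data for every wanted, not yet registered domain."""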
1135     logger.debug("args[]='%s' - CALLED!", type(args))
1136
1137     logger.debug("Invoking locking.acquire() ...")
1138     locking.acquire()
1139
1140     source_domain = "fedipact.online"
1141     if sources.is_recent(source_domain):
1142         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1143         return 0
1144     else:
1145         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1146         sources.update(source_domain)
1147
1148     response = utils.fetch_url(
1149         f"https://{source_domain}",
1150         network.web_headers,
1151         (config.get("connection_timeout"), config.get("read_timeout"))
1152     )
1153
1154     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1155     if response.ok and response.status_code < 300 and response.text != "":
1156         logger.debug("Parsing %d Bytes ...", len(response.text))
1157
1158         doc = bs4.BeautifulSoup(response.text, "html.parser")
1159         logger.debug("doc[]='%s'", type(doc))
1160
1161         rows = doc.findAll("li")
1162         logger.info("Checking %d row(s) ...", len(rows))
1163         for row in rows:
1164             logger.debug("row[]='%s'", type(row))
1165             domain = tidyup.domain(row.contents[0])
1166
1167             logger.debug("domain='%s' - AFTER!", domain)
1168             if domain == "":
1169                 logger.debug("domain is empty - SKIPPED!")
1170                 continue
1171             elif not utils.is_domain_wanted(domain):
1172                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1173                 continue
1174             elif instances.is_registered(domain):
1175                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1176                 continue
1177             elif instances.is_recent(domain):
1178                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1179                 continue
1180
1181             logger.info("Fetching domain='%s' ...", domain)
1182             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1183
1184     logger.debug("Success! - EXIT!")
1185     return 0
1186
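# Sketch of the <li> scraping above against a static snippet; bs4 is imported
# at module level and the helper is hypothetical. row.contents[0] is the first
# child *node* of the <li>, which is why the real code passes it through
# tidyup.domain() before using it.
def _scrape_fedipact_sketch() -> list:
    html = "<ul><li>example.social</li><li>demo.example</li></ul>"
    doc = bs4.BeautifulSoup(html, "html.parser")
    return [str(li.contents[0]).strip() for li in doc.findAll("li")]
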
1187 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1188     logger.debug("args[]='%s' - CALLED!", type(args))
1189
1190     logger.debug("Invoking locking.acquire() ...")
1191     locking.acquire()
1192
1193     source_domain = "joinfediverse.wiki"
1194     if sources.is_recent(source_domain):
1195         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1196         return 0
1197     else:
1198         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1199         sources.update(source_domain)
1200
1201     raw = utils.fetch_url(
1202         f"https://{source_domain}/FediBlock",
1203         network.web_headers,
1204         (config.get("connection_timeout"), config.get("read_timeout"))
1205     ).text
1206     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1207
1208     doc = bs4.BeautifulSoup(raw, "html.parser")
1209     logger.debug("doc[]='%s'", type(doc))
1210
1211     tables = doc.findAll("table", {"class": "wikitable"})
1212
1213     logger.info("Analyzing %d table(s) ...", len(tables))
1214     blocklist = list()
1215     for table in tables:
1216         logger.debug("table[]='%s'", type(table))
1217
1218         rows = table.findAll("tr")
1219         logger.info("Checking %d row(s) ...", len(rows))
1220         block_headers = dict()
1221         for row in rows:
1222             logger.debug("row[%s]='%s'", type(row), row)
1223
1224             headers = row.findAll("th")
1225             logger.debug("Found headers()=%d header(s)", len(headers))
1226             if len(headers) > 1:
1227                 block_headers = dict()
1228                 cnt = 0
1229                 for header in headers:
1230                     cnt = cnt + 1
1231                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1232                     text = header.contents[0]
1233
1234                     logger.debug("text[]='%s'", type(text))
1235                     if not isinstance(text, str):
1236                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1237                         continue
1238                     elif validators.domain(text.strip()):
1239                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1240                         continue
1241
1242                     text = tidyup.domain(text.strip())
1243                     logger.debug("text='%s'", text)
1244                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1245                         logger.debug("Found header: '%s'=%d", text, cnt)
1246                         block_headers[cnt] = text
1247
1248             elif len(block_headers) == 0:
1249                 logger.debug("row is not scrapable - SKIPPED!")
1250                 continue
1251             elif len(block_headers) > 0:
1252                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1253                 cnt = 0
1254                 block = dict()
1255
1256                 for element in row.find_all(["th", "td"]):
1257                     cnt = cnt + 1
1258                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1259                     if cnt in block_headers:
1260                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1261
1262                         text = element.text.strip()
1263                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1264
1265                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1266                         if key == "blocked":  # "domain" and "instance" were remapped to "blocked" above
1267                             block[key] = text
1268                         elif key == "block reason(s)":  # bare "reason" never occurs as a header key here
1269                             block[key] = tidyup.reason(text)
1270                         elif key == "subdomain(s)":
1271                             block[key] = list()
1272                             if text != "":
1273                                 block[key] = text.split("/")
1274                         else:
1275                             logger.debug("key='%s'", key)
1276                             block[key] = text
1277
1278                 logger.debug("block()=%d ...", len(block))
1279                 if len(block) > 0:
1280                     logger.debug("Appending block()=%d ...", len(block))
1281                     blocklist.append(block)
1282
1283     logger.debug("blocklist()=%d", len(blocklist))
1284
1285     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1286     domains = database.cursor.fetchall()
1287
1288     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1289     blocking = list()
1290     for block in blocklist:
1291         logger.debug("block='%s'", block)
1292         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1293             origin = block["blocked"]
1294             for subdomain in block["subdomain(s)"]:
1295                 # append a fresh copy per subdomain; re-appending the same dict would leave every entry with the last subdomain
1296                 blocking.append({**block, "blocked": subdomain + "." + origin})
1297         else:
1298             blocking.append(block)
1299
1300     logger.debug("blocking()=%d", len(blocking))
1301     for block in blocking:
1302         logger.debug("block[]='%s'", type(block))
1303         block["blocked"] = tidyup.domain(block["blocked"])
1304
1305         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1306         if block["blocked"] == "":
1307             logger.debug("block[blocked] is empty - SKIPPED!")
1308             continue
1309         elif not utils.is_domain_wanted(block["blocked"]):
1310             logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1311             continue
1312         elif instances.is_recent(block["blocked"]):
1313             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
1314             continue
1315
1316         logger.info("Processing blocked='%s' ...", block["blocked"])
1317         utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1318
1319     blockdict = list()
1320     for blocker in domains:
1321         blocker = blocker[0]
1322         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1323
1324         for block in blocking:
1325             logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block.get("reason"))
1326             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1327
1328             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1329             if block["blocked"] == "":
1330                 logger.debug("block[blocked] is empty - SKIPPED!")
1331                 continue
1332             elif not utils.is_domain_wanted(block["blocked"]):
1333                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1334                 continue
1335
1336             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1337             if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1338                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1339                 blockdict.append({
1340                     "blocked": block["blocked"],
1341                     "reason" : block["reason"],
1342                 })
1343
1344         if instances.has_pending(blocker):
1345             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1346             instances.update_data(blocker)
1347
1348         logger.debug("Invoking commit() ...")
1349         database.connection.commit()
1350
1351         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1352         if config.get("bot_enabled") and len(blockdict) > 0:
1353             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1354             network.send_bot_post(blocker, blockdict)
1355
1356     logger.debug("Success! - EXIT!")
1357     return 0
1358
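# Sketch of the positional header mapping used while scraping the FediBlock
# wikitable above: header cells are recorded by 1-based column index so that
# data cells in later rows can be assigned to the matching key. Hypothetical
# helper, shown only to document the parsing idea.
def _map_wikitable_headers_sketch(header_texts: list) -> dict:
    wanted = ["domain", "instance", "subdomain(s)", "block reason(s)"]
    return {cnt: text for cnt, text in enumerate(header_texts, start=1) if text in wanted}
# e.g. _map_wikitable_headers_sketch(["instance", "subdomain(s)", "block reason(s)"])
# returns {1: 'instance', 2: 'subdomain(s)', 3: 'block reason(s)'}
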
1359 def recheck_obfuscation(args: argparse.Namespace) -> int:
1360     logger.debug("args[]='%s' - CALLED!", type(args))
1361
1362     logger.debug("Invoking locking.acquire() ...")
1363     locking.acquire()
1364
1365     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1366         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1367     elif isinstance(args.software, str) and args.software != "":
1368         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1369     else:
1370         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1371
1372     rows = database.cursor.fetchall()
1373     logger.info("Checking %d domains ...", len(rows))
1374     for row in rows:
1375         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1376         if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
1377             logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1378             continue
1379
1380         blocking = list()
1381         if row["software"] == "pleroma":
1382             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1383             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1384         elif row["software"] == "mastodon":
1385             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1386             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1387         elif row["software"] == "lemmy":
1388             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1389             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1390         elif row["software"] == "friendica":
1391             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1392             blocking = friendica.fetch_blocks(row["domain"])
1393         elif row["software"] == "misskey":
1394             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1395             blocking = misskey.fetch_blocks(row["domain"])
1396         else:
1397             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1398
1399         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1400         instances.set_total_blocks(row["domain"], blocking)
1401
1402         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1403         obfuscated = 0
1404         blockdict = list()
1405         for block in blocking:
1406             logger.debug("block[blocked]='%s'", block["blocked"])
1407             blocked = None
1408
1409             if block["blocked"] == "":
1410                 logger.debug("block[blocked] is empty - SKIPPED!")
1411                 continue
1412             elif block["blocked"].endswith(".arpa"):
1413                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1414                 continue
1415             elif block["blocked"].endswith(".tld"):
1416                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1417                 continue
1418             elif block["blocked"].endswith(".onion"):
1419                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1420                 continue
1421             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1422                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1423                 obfuscated = obfuscated + 1
1424                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1425             elif not utils.is_domain_wanted(block["blocked"]):
1426                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1427                 continue
1428             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1429                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1430                 continue
1431
1432             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1433             if blocked is not None and blocked != block["blocked"]:
1434                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1435                 obfuscated = obfuscated - 1
1436                 if blocks.is_instance_blocked(row["domain"], blocked):
1437                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1438                     continue
1439
1440                 block["block_level"] = utils.alias_block_level(block["block_level"])
1441
1442                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1443                 if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1444                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block["reason"], row["domain"])
1445                     blockdict.append({
1446                         "blocked": blocked,
1447                         "reason" : block["reason"],
1448                     })
1449
1450         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1451         if obfuscated == 0 and len(blocking) > 0:
1452             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1453             instances.set_has_obfuscation(row["domain"], False)
1454
1455         if instances.has_pending(row["domain"]):
1456             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1457             instances.update_data(row["domain"])
1458
1459         logger.debug("Invoking commit() ...")
1460         database.connection.commit()
1461
1462         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1463         if config.get("bot_enabled") and len(blockdict) > 0:
1464             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1465             network.send_bot_post(row["domain"], blockdict)
1466
1467     logger.debug("Success! - EXIT!")
1468     return 0
1469
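# Sketch of the deobfuscation step above: blockers may publish patterns such
# as "*.example.com" or "examp??.com" plus a digest instead of the clear-text
# domain. utils.deobfuscate() resolves those against known instances; this
# hypothetical check only illustrates the digest comparison, assuming the
# Mastodon convention of a hex SHA-256 digest over the domain name.
def _digest_matches_sketch(candidate: str, digest: str) -> bool:
    import hashlib
    return hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest
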
1470 def fetch_fedilist(args: argparse.Namespace) -> int:
1471     logger.debug("args[]='%s' - CALLED!", type(args))
1472
1473     logger.debug("Invoking locking.acquire() ...")
1474     locking.acquire()
1475
1476     source_domain = "demo.fedilist.com"
1477     if sources.is_recent(source_domain):
1478         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1479         return 0
1480     else:
1481         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1482         sources.update(source_domain)
1483
1484     url = f"http://{source_domain}/instance/csv?onion=not"
1485     if args.software is not None and args.software != "":
1486         logger.debug("args.software='%s'", args.software)
1487         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1488
1489     logger.info("Fetching url='%s' ...", url)
1490     response = reqto.get(
1491         url,
1492         headers=network.web_headers,
1493         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1494         allow_redirects=False
1495     )
1496
1497     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1498     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1499         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1500         return 1
1501
1502     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1503
1504     logger.debug("reader[]='%s'", type(reader))
1505     blockdict = list()
1506     for row in reader:
1507         logger.debug("row[]='%s'", type(row))
1508         domain = tidyup.domain(row["hostname"])
1509         logger.debug("domain='%s' - AFTER!", domain)
1510
1511         if domain == "":
1512             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1513             continue
1514         elif not utils.is_domain_wanted(domain):
1515             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1516             continue
1517         elif (args.all is None or not args.all) and instances.is_registered(domain):
1518             logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s' - SKIPPED!", domain, type(args.all))
1519             continue
1520         elif instances.is_recent(domain):
1521             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1522             continue
1523
1524         logger.info("Fetching instances from domain='%s' ...", domain)
1525         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1526
1527     logger.debug("Success! - EXIT!")
1528     return 0
1529
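# Sketch of the CSV handling above: fedilist serves one instance per row with
# a "hostname" column, and csv.DictReader keys each row by the header line.
# Static sample data; hypothetical helper.
def _parse_fedilist_csv_sketch() -> list:
    sample = "hostname,software\nexample.social,mastodon\ndemo.example,pleroma\n"
    reader = csv.DictReader(sample.splitlines(), dialect="unix")
    return [row["hostname"] for row in reader]
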
1530 def update_nodeinfo(args: argparse.Namespace) -> int:
1531     logger.debug("args[]='%s' - CALLED!", type(args))
1532
1533     logger.debug("Invoking locking.acquire() ...")
1534     locking.acquire()
1535
1536     if args.domain is not None and args.domain != "":
1537         logger.debug("Fetching args.domain='%s'", args.domain)
1538         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1539     elif args.software is not None and args.software != "":
1540         logger.info("Fetching domains for args.software='%s'", args.software)
1541         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1542     else:
1543         logger.info("Fetching domains for recently updated ...")
1544         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1545
1546     domains = database.cursor.fetchall()
1547
1548     logger.info("Checking %d domain(s) ...", len(domains))
1549     cnt = 0
1550     for row in domains:
1551         logger.debug("row[]='%s'", type(row))
1552         try:
1553             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1554             software = federation.determine_software(row["domain"])
1555
1556             logger.debug("Determined software='%s'", software)
1557             if software != row["software"] and software is not None:
1558                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1559                 instances.set_software(row["domain"], software)
1560
1561             instances.set_success(row["domain"])
1562         except network.exceptions as exception:
1563             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1564             instances.set_last_error(row["domain"], exception)
1565
1566         instances.set_last_nodeinfo(row["domain"])
1567         instances.update_data(row["domain"])
1568         cnt = cnt + 1
1569
1570     logger.debug("Success! - EXIT!")
1571     return 0
1572
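# Sketch of the recheck window used in the default branch above: a row
# qualifies when its nodeinfo has never been fetched or the last fetch is
# older than config.get("recheck_nodeinfo") seconds. Hypothetical helper
# mirroring the SQL condition in Python.
def _needs_nodeinfo_recheck_sketch(last_nodeinfo, recheck_after: float) -> bool:
    return last_nodeinfo is None or last_nodeinfo < time.time() - recheck_after
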
1573 def fetch_instances_social(args: argparse.Namespace) -> int:
1574     logger.debug("args[]='%s' - CALLED!", type(args))
1575
1576     logger.debug("Invoking locking.acquire() ...")
1577     locking.acquire()
1578
1579     source_domain = "instances.social"
1580
1581     if config.get("instances_social_api_key") == "":
1582         logger.error("API key not set. Please set in your config.json file.")
1583         return 1
1584     elif sources.is_recent(source_domain):
1585         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1586         return 0
1587     else:
1588         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1589         sources.update(source_domain)
1590
1591     headers = {
1592         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1593     }
1594
1595     fetched = network.get_json_api(
1596         source_domain,
1597         "/api/1.0/instances/list?count=0&sort_by=name",
1598         headers,
1599         (config.get("connection_timeout"), config.get("read_timeout"))
1600     )
1601     logger.debug("fetched[]='%s'", type(fetched))
1602
1603     if "error_message" in fetched:
1604         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1605         return 2
1606     elif "exception" in fetched:
1607         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1608         return 3
1609     elif "json" not in fetched:
1610         logger.warning("fetched has no element 'json' - EXIT!")
1611         return 4
1612     elif "instances" not in fetched["json"]:
1613         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1614         return 5
1615
1616     domains = list()
1617     rows = fetched["json"]["instances"]
1618
1619     logger.info("Checking %d row(s) ...", len(rows))
1620     for row in rows:
1621         logger.debug("row[]='%s'", type(row))
1622         domain = tidyup.domain(row["name"])
1623
1624         logger.debug("domain='%s' - AFTER!", domain)
1625         if domain == "":
1626             logger.debug("domain is empty - SKIPPED!")
1627             continue
1628         elif not utils.is_domain_wanted(domain):
1629             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1630             continue
1631         elif domain in domains:
1632             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1633             continue
1634         elif instances.is_registered(domain):
1635             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1636             continue
1637         elif instances.is_recent(domain):
1638             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1639             continue
1640
1641         logger.info("Fetching instances from domain='%s'", domain)
1642         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1643
1644     logger.debug("Success! - EXIT!")
1645     return 0
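
# Sketch of the instances.social call above as a direct HTTP request; reqto
# and config are imported at module level and the helper is hypothetical.
# Path, count=0 (no limit) and the Bearer header mirror the
# network.get_json_api() call; the error handling done by the real code is
# omitted here.
def _instances_social_sketch(api_key: str) -> list:
    response = reqto.get(
        "https://instances.social/api/1.0/instances/list?count=0&sort_by=name",
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=(config.get("connection_timeout"), config.get("read_timeout"))
    )
    return [row["name"] for row in response.json()["instances"]]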