# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
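    """ Checks whether args.domain is a valid, not blacklisted and not yet
    registered domain. Returns 0 if the domain is unknown, 100 if invalid,
    101 if blacklisted and 102 if already registered. """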
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
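    """ Cross-checks each registered instance's nodeinfo_url against its
    (punycode-encoded) domain and logs any mismatch. Always returns 0. """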
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d mismatching row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
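    """ Fetches the server list from the pixelfed.org API and registers all
    new, wanted domains. """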
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is sent by default, network.source_headers doesn't need to be added here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
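    """ Fetches a domain list from the gql.api.bka.li GraphQL API and
    registers all new, wanted domains. """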
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
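    """ Fetches block lists from registered instances, optionally limited to
    a single domain (args.domain) or software (args.software), deobfuscates
    blocked domains where possible and records the blocks. """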
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = utils.alias_block_level(block["block_level"])

            if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
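    """ Scrapes the per-software instance tables from fediverse.observer and
    registers all new, wanted domains. """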
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        software = software_helper.alias(software)
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
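    """ Scrapes the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks by todon.eu. """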
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
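    """ Fetches chaos.social's federation.md from raw.githubusercontent.com,
    parses the silenced/blocked tables and records them as blocks by
    chaos.social. """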
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d,doc[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    blockdict = list()
    if len(blocking) > 0:
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
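    """ Parses an FBA-specific RSS feed (args.feed) and registers all new,
    wanted domains found in it. """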
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
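    """ Parses the FBA bot's ATOM feed on ryona.agency and registers all
    new, wanted domains found in it. """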
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue
                    elif not utils.is_domain_wanted(domain):
                        logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
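    """ Fetches instances from args.domain and, unless --single is given,
    re-crawls known instances whose last fetch is older than the configured
    recheck interval. """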
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(row["domain"]):
            logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
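    """ Downloads CSV block lists from the oliphant/blocklists repository on
    codeberg.org and records the blocks. """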
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = row["#severity"]
            elif "severity" in row:
                severity = row["severity"]
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='reject' for blocker='%s' ...", domain, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                utils.process_block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                utils.process_block(block["blocker"], domain, None, "reject_reports")

        logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
        instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
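    """ Downloads static text block lists (currently seirdy.one) and
    processes the domains listed in them. """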
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain)

                logger.debug("domain='%s' - AFTER!", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
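    """ Scrapes the participant list from fedipact.online and registers all
    new, wanted domains. """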
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)
1130
1131     response = utils.fetch_url(
1132         f"https://{source_domain}",
1133         network.web_headers,
1134         (config.get("connection_timeout"), config.get("read_timeout"))
1135     )
1136
1137     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1138     if response.ok and response.status_code < 300 and response.text != "":
1139         logger.debug("Parsing %d Bytes ...", len(response.text))
1140
1141         doc = bs4.BeautifulSoup(response.text, "html.parser")
1142         logger.debug("doc[]='%s'", type(doc))
1143
1144         rows = doc.findAll("li")
1145         logger.info("Checking %d row(s) ...", len(rows))
1146         for row in rows:
1147             logger.debug("row[]='%s'", type(row))
1148             domain = tidyup.domain(row.contents[0])
1149
1150             logger.debug("domain='%s' - AFTER!", domain)
1151             if domain == "":
1152                 logger.debug("domain is empty - SKIPPED!")
1153                 continue
1154             elif not utils.is_domain_wanted(domain):
1155                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1156                 continue
1157             elif instances.is_registered(domain):
1158                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1159                 continue
1160             elif instances.is_recent(domain):
1161                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1162                 continue
1163
1164             logger.info("Fetching domain='%s' ...", domain)
1165             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1166
1167     logger.debug("Success! - EXIT!")
1168     return 0
1169
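# The <li> scraping above reads row.contents[0], the first child node of each
# list element. A minimal bs4 sketch against made-up markup:
#
#     import bs4
#
#     html = "<ul><li>example.social</li><li>pleroma.example</li></ul>"
#     doc = bs4.BeautifulSoup(html, "html.parser")
#     for item in doc.find_all("li"):
#         print(item.contents[0])  # -> example.social, then pleroma.example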
1170 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1171     logger.debug("args[]='%s' - CALLED!", type(args))
1172
1173     logger.debug("Invoking locking.acquire() ...")
1174     locking.acquire()
1175
1176     source_domain = "joinfediverse.wiki"
1177     if sources.is_recent(source_domain):
1178         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1179         return 0
1180     else:
1181         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1182         sources.update(source_domain)
1183
1184     raw = utils.fetch_url(
1185         f"https://{source_domain}/FediBlock",
1186         network.web_headers,
1187         (config.get("connection_timeout"), config.get("read_timeout"))
1188     ).text
1189     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1190
1191     doc = bs4.BeautifulSoup(raw, "html.parser")
1192     logger.debug("doc[]='%s'", type(doc))
1193
1194     tables = doc.find_all("table", {"class": "wikitable"})
1195
1196     logger.info("Analyzing %d table(s) ...", len(tables))
1197     blocklist = list()
1198     for table in tables:
1199         logger.debug("table[]='%s'", type(table))
1200
1201         rows = table.find_all("tr")
1202         logger.info("Checking %d row(s) ...", len(rows))
1203         block_headers = dict()
1204         for row in rows:
1205             logger.debug("row[%s]='%s'", type(row), row)
1206
1207             headers = row.find_all("th")
1208             logger.debug("Found headers()=%d header(s)", len(headers))
1209             if len(headers) > 1:
1210                 block_headers = dict()
1211                 cnt = 0
1212                 for header in headers:
1213                     cnt = cnt + 1
1214                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1215                     text = header.contents[0]
1216
1217                     logger.debug("text[]='%s'", type(text))
1218                     if not isinstance(text, str):
1219                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1220                         continue
1221                     elif validators.domain(text.strip()):
1222                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1223                         continue
1224
1225                     text = tidyup.domain(text.strip())
1226                     logger.debug("text='%s'", text)
1227                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1228                         logger.debug("Found header: '%s'=%d", text, cnt)
1229                         block_headers[cnt] = text
1230
1231             elif len(block_headers) == 0:
1232                 logger.debug("row is not scrapable - SKIPPED!")
1233                 continue
1234             else:
1235                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1236                 cnt = 0
1237                 block = dict()
1238
1239                 for element in row.find_all(["th", "td"]):
1240                     cnt = cnt + 1
1241                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1242                     if cnt in block_headers:
1243                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1244
1245                         text = element.text.strip()
1246                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1247
1248                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1249                         # key is one of "blocked", "subdomain(s)" or "block reason(s)" here
1250                         if key == "blocked":
1251                             block[key] = text
1252                         elif key == "subdomain(s)":
1254                             block[key] = list()
1255                             if text != "":
1256                                 block[key] = text.split("/")
1257                         else:
1258                             logger.debug("key='%s'", key)
1259                             block[key] = text
1260
1261                 logger.debug("block()=%d ...", len(block))
1262                 if len(block) > 0:
1263                     logger.debug("Appending block()=%d ...", len(block))
1264                     blocklist.append(block)
1265
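    # For orientation (a sketch, assuming tidyup.domain() lower-cases the
    # header text): a header row like
    #
    #     <tr><th>Instance</th><th>Subdomain(s)</th><th>Block reason(s)</th></tr>
    #
    # yields block_headers = {1: "instance", 2: "subdomain(s)", 3: "block reason(s)"},
    # and each matching data row is folded into a dict such as
    #
    #     {"blocked": "example.com", "subdomain(s)": ["social"], "block reason(s)": "spam"}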
1266     logger.debug("blocklist()=%d", len(blocklist))
1267
1268     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1269     domains = database.cursor.fetchall()
1270
1271     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1272     blocking = list()
1273     for block in blocklist:
1274         logger.debug("block='%s'", block)
1275         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1276             origin = block["blocked"]
1277             for subdomain in block["subdomain(s)"]:
1278                 # copy the dict per subdomain; appending the shared object
1279                 # would leave every entry pointing at the last subdomain only
1280                 entry = block.copy()
1281                 entry["blocked"] = subdomain + "." + origin
1282                 blocking.append(entry)
1280         else:
1281             blocking.append(block)
1282
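    # Example of the expansion above: {"blocked": "example.com",
    # "subdomain(s)": ["social", "pics"], ...} becomes two independent entries
    # for "social.example.com" and "pics.example.com" (hence the per-subdomain
    # copy).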
1283     logger.debug("blocking()=%d", len(blocking))
1284     for block in blocking:
1285         logger.debug("block[]='%s'", type(block))
1286         block["blocked"] = tidyup.domain(block["blocked"])
1287
1288         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1289         if block["blocked"] == "":
1290             logger.debug("block[blocked] is empty - SKIPPED!")
1291             continue
1292         elif not utils.is_domain_wanted(block["blocked"]):
1293             logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1294             continue
1295         elif instances.is_recent(block["blocked"]):
1296             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
1297             continue
1298
1299         logger.info("Processing blocked='%s' ...", block["blocked"])
1300         utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1301
1302     blockdict = list()
1303     for blocker in domains:
1304         blocker = blocker[0]
1305         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1306
1307         for block in blocking:
1308             logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block.get("reason"))
1309             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1310
1311             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1312             if block["blocked"] == "":
1313                 logger.debug("block[blocked] is empty - SKIPPED!")
1314                 continue
1315             elif not utils.is_domain_wanted(block["blocked"]):
1316                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1317                 continue
1318
1319             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1320             if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1321                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1322                 blockdict.append({
1323                     "blocked": block["blocked"],
1324                     "reason" : block["reason"],
1325                 })
1326
1327         if instances.has_pending(blocker):
1328             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1329             instances.update_data(blocker)
1330
1331         logger.debug("Invoking commit() ...")
1332         database.connection.commit()
1333
1334         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1335         if config.get("bot_enabled") and len(blockdict) > 0:
1336             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1337             network.send_bot_post(blocker, blockdict)
1338
1339     logger.debug("Success! - EXIT!")
1340     return 0
1341
1342 def recheck_obfuscation(args: argparse.Namespace) -> int:
1343     logger.debug("args[]='%s' - CALLED!", type(args))
1344
1345     logger.debug("Invoking locking.acquire() ...")
1346     locking.acquire()
1347
1348     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1349         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1350     elif isinstance(args.software, str) and args.software != "":
1351         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1352     else:
1353         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1354
1355     rows = database.cursor.fetchall()
1356     logger.info("Checking %d domains ...", len(rows))
1357     for row in rows:
1358         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1359         if not args.all and args.domain is None and args.software is None and instances.is_recent(row["domain"]):
1360             logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1361             continue
1362
1363         blocking = list()
1364         if row["software"] == "pleroma":
1365             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1366             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1367         elif row["software"] == "mastodon":
1368             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1369             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1370         elif row["software"] == "lemmy":
1371             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1372             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1373         elif row["software"] == "friendica":
1374             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1375             blocking = friendica.fetch_blocks(row["domain"])
1376         elif row["software"] == "misskey":
1377             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1378             blocking = misskey.fetch_blocks(row["domain"])
1379         else:
1380             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1381
1382         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1383         instances.set_total_blocks(row["domain"], blocking)
1384
1385         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1386         obfuscated = 0
1387         blockdict = list()
1388         for block in blocking:
1389             logger.debug("block[blocked]='%s'", block["blocked"])
1390             blocked = None
1391
1392             if block["blocked"] == "":
1393                 logger.debug("block[blocked] is empty - SKIPPED!")
1394                 continue
1395             elif block["blocked"].endswith(".arpa"):
1396                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1397                 continue
1398             elif block["blocked"].endswith(".tld"):
1399                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1400                 continue
1401             elif block["blocked"].endswith(".onion"):
1402                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1403                 continue
1404             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1405                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1406                 obfuscated = obfuscated + 1
1407                 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1408             elif not utils.is_domain_wanted(block["blocked"]):
1409                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1410                 continue
1411             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1412                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1413                 continue
1414
1415             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1416             if blocked is not None and blocked != block["blocked"]:
1417                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1418                 obfuscated = obfuscated - 1
1419                 if blocks.is_instance_blocked(row["domain"], blocked):
1420                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1421                     continue
1422
1423                 block["block_level"] = utils.alias_block_level(block["block_level"])
1424
1425                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1426                 if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1427                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1428                     blockdict.append({
1429                         "blocked": blocked,
1430                         "reason" : block["reason"],
1431                     })
1432
1433         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1434         if obfuscated == 0 and len(blocking) > 0:
1435             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1436             instances.set_has_obfuscation(row["domain"], False)
1437
1438         if instances.has_pending(row["domain"]):
1439             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1440             instances.update_data(row["domain"])
1441
1442         logger.debug("Invoking commit() ...")
1443         database.connection.commit()
1444
1445         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1446         if config.get("bot_enabled") and len(blockdict) > 0:
1447             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1448             network.send_bot_post(row["domain"], blockdict)
1449
1450     logger.debug("Success! - EXIT!")
1451     return 0
1452
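# Obfuscated entries in the sense of recheck_obfuscation() contain "*" or "?"
# placeholders; some exports (e.g. Mastodon's) may additionally carry a digest
# of the hidden domain under "hash", which utils.deobfuscate_domain() can match
# against known instances. A rough illustration (all values made up):
#
#     block = {
#         "blocked"    : "exam***.com",
#         "hash"       : "50d858e0985ecc7f60418aaf0cc5ab587f42c2570a884095a9e8ccacd0f6545c",
#         "reason"     : "spam",
#         "block_level": "reject",
#     }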
1453 def fetch_fedilist(args: argparse.Namespace) -> int:
1454     logger.debug("args[]='%s' - CALLED!", type(args))
1455
1456     logger.debug("Invoking locking.acquire() ...")
1457     locking.acquire()
1458
1459     source_domain = "demo.fedilist.com"
1460     if sources.is_recent(source_domain):
1461         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1462         return 0
1463     else:
1464         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1465         sources.update(source_domain)
1466
1467     url = f"http://{source_domain}/instance/csv?onion=not"
1468     if args.software is not None and args.software != "":
1469         logger.debug("args.software='%s'", args.software)
1470         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1471
1472     logger.info("Fetching url='%s' ...", url)
1473     response = reqto.get(
1474         url,
1475         headers=network.web_headers,
1476         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1477         allow_redirects=False
1478     )
1479
1480     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1481     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1482         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1483         return 1
1484
1485     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1486
1487     logger.debug("reader[]='%s'", type(reader))
1489     for row in reader:
1490         logger.debug("row[]='%s'", type(row))
1491         domain = tidyup.domain(row["hostname"])
1492         logger.debug("domain='%s' - AFTER!", domain)
1493
1494         if domain == "":
1495             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1496             continue
1497         elif not utils.is_domain_wanted(domain):
1498             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1499             continue
1500         elif (args.all is None or not args.all) and instances.is_registered(domain):
1501             logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s' - SKIPPED!", domain, type(args.all))
1502             continue
1503         elif instances.is_recent(domain):
1504             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1505             continue
1506
1507         logger.info("Fetching instances from domain='%s' ...", domain)
1508         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1509
1510     logger.debug("Success! - EXIT!")
1511     return 0
1512
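# fetch_fedilist() above relies on csv.DictReader mapping each data row onto
# the header line of the export. A minimal sketch with a made-up payload:
#
#     import csv
#
#     payload = "hostname,software\nexample.social,mastodon\n"
#     for row in csv.DictReader(payload.splitlines(), dialect="unix"):
#         print(row["hostname"], row["software"])  # -> example.social mastodon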
1513 def update_nodeinfo(args: argparse.Namespace) -> int:
1514     logger.debug("args[]='%s' - CALLED!", type(args))
1515
1516     logger.debug("Invoking locking.acquire() ...")
1517     locking.acquire()
1518
1519     if args.domain is not None and args.domain != "":
1520         logger.debug("Fetching args.domain='%s'", args.domain)
1521         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1522     elif args.software is not None and args.software != "":
1523         logger.info("Fetching domains for args.software='%s'", args.software)
1524         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1525     else:
1526         logger.info("Fetching domains with stale nodeinfo ...")
1527         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1528
1529     domains = database.cursor.fetchall()
1530
1531     logger.info("Checking %d domain(s) ...", len(domains))
1532     cnt = 0
1533     for row in domains:
1534         logger.debug("row[]='%s'", type(row))
1535         try:
1536             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1537             software = federation.determine_software(row["domain"])
1538
1539             logger.debug("Determined software='%s'", software)
1540             if software is not None and software != row["software"]:
1541                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1542                 instances.set_software(row["domain"], software)
1543
1544             instances.set_success(row["domain"])
1545         except network.exceptions as exception:
1546             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1547             instances.set_last_error(row["domain"], exception)
1548
1549         instances.set_last_nodeinfo(row["domain"])
1550         instances.update_data(row["domain"])
1551         cnt = cnt + 1
1552
1553     logger.debug("Success! - EXIT!")
1554     return 0
1555
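# The fallback query in update_nodeinfo() selects rows whose last_nodeinfo
# timestamp is older than "now minus recheck window". Sketch, assuming the
# configured recheck_nodeinfo value is in seconds (86400 = one day is
# illustrative only):
#
#     import time
#
#     cutoff = time.time() - 86400
#     # ... WHERE last_nodeinfo < cutoff OR last_nodeinfo IS NULL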
1556 def fetch_instances_social(args: argparse.Namespace) -> int:
1557     logger.debug("args[]='%s' - CALLED!", type(args))
1558
1559     logger.debug("Invoking locking.acquire() ...")
1560     locking.acquire()
1561
1562     source_domain = "instances.social"
1563
1564     if config.get("instances_social_api_key") == "":
1565         logger.error("API key not set. Please set it in your config.json file.")
1566         return 1
1567     elif sources.is_recent(source_domain):
1568         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1569         return 0
1570     else:
1571         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1572         sources.update(source_domain)
1573
1574     headers = {
1575         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1576     }
1577
1578     fetched = network.get_json_api(
1579         source_domain,
1580         "/api/1.0/instances/list?count=0&sort_by=name",
1581         headers,
1582         (config.get("connection_timeout"), config.get("read_timeout"))
1583     )
1584     logger.debug("fetched[]='%s'", type(fetched))
1585
1586     if "error_message" in fetched:
1587         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1588         return 2
1589     elif "exception" in fetched:
1590         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1591         return 3
1592     elif "json" not in fetched:
1593         logger.warning("fetched has no element 'json' - EXIT!")
1594         return 4
1595     elif "instances" not in fetched["json"]:
1596         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1597         return 5
1598
1599     domains = list()
1600     rows = fetched["json"]["instances"]
1601
1602     logger.info("Checking %d row(s) ...", len(rows))
1603     for row in rows:
1604         logger.debug("row[]='%s'", type(row))
1605         domain = tidyup.domain(row["name"])
1606
1607         logger.debug("domain='%s' - AFTER!", domain)
1608         if domain == "":
1609             logger.debug("domain is empty - SKIPPED!")
1610             continue
1611         elif not utils.is_domain_wanted(domain):
1612             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1613             continue
1614         elif domain in domains:
1615             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1616             continue
1617         elif instances.is_registered(domain):
1618             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1619             continue
1620         elif instances.is_recent(domain):
1621             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1622             continue
1623
1624         logger.info("Fetching instances from domain='%s' ...", domain)
1625         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1626         # remember the domain so the "is already added" check above can take effect
1627         domains.append(domain)
1626
1627     logger.debug("Success! - EXIT!")
1628     return 0
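# For reference, the instances.social call above amounts to a plain
# bearer-authenticated GET; a rough stand-alone equivalent using requests
# (token and timeout values are placeholders):
#
#     import requests
#
#     response = requests.get(
#         "https://instances.social/api/1.0/instances/list?count=0&sort_by=name",
#         headers={"Authorization": "Bearer YOUR-API-KEY"},
#         timeout=(5.0, 10.0),
#     )
#     for instance in response.json()["instances"]:
#         print(instance["name"])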