1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import csv
18 import inspect
19 import json
20 import logging
21 import time
22
23 import argparse
24 import atoma
25 import bs4
26 import markdown
27 import reqto
28 import validators
29
30 from fba import csrf
31 from fba import database
32 from fba import utils
33
34 from fba.helpers import blacklist
35 from fba.helpers import config
36 from fba.helpers import cookies
37 from fba.helpers import locking
38 from fba.helpers import software as software_helper
39 from fba.helpers import tidyup
40
41 from fba.http import federation
42 from fba.http import network
43
44 from fba.models import apis
45 from fba.models import blocks
46 from fba.models import instances
47
48 from fba.networks import friendica
49 from fba.networks import lemmy
50 from fba.networks import mastodon
51 from fba.networks import misskey
52 from fba.networks import pleroma
53
54 logging.basicConfig(level=logging.INFO)
55 logger = logging.getLogger(__name__)
56 #logger.setLevel(logging.DEBUG)
57
58 def check_instance(args: argparse.Namespace) -> int:
59     logger.debug("args.domain='%s' - CALLED!", args.domain)
60     status = 0
61     if not validators.domain(args.domain):
62         logger.warning("args.domain='%s' is not valid", args.domain)
63         status = 100
64     elif blacklist.is_blacklisted(args.domain):
65         logger.warning("args.domain='%s' is blacklisted", args.domain)
66         status = 101
67     elif instances.is_registered(args.domain):
68         logger.warning("args.domain='%s' is already registered", args.domain)
69         status = 102
70     else:
71         logger.info("args.domain='%s' is not known", args.domain)
72
73     logger.debug("status=%d - EXIT!", status)
74     return status
75
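check_instance() reports its outcome purely through the return code (0 = unknown and usable, 100 = invalid, 101 = blacklisted, 102 = already registered). A minimal sketch of calling it directly, assuming the fba package and its database are initialized; the Namespace mirrors what the CLI's argparse layer would hand over, and the status labels are assumptions derived from the codes assigned above:

# Hedged sketch: invoke check_instance() with a hand-built Namespace,
# just as the CLI dispatcher would.
from argparse import Namespace

labels = {0: "unknown", 100: "invalid", 101: "blacklisted", 102: "registered"}
status = check_instance(Namespace(domain="example.com"))
print(labels.get(status, "unexpected"))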
76 def check_nodeinfo(args: argparse.Namespace) -> int:
77     logger.debug("args[]='%s' - CALLED!", type(args))
78
79     # Fetch rows
80     database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")
81
82     cnt = 0
83     for row in database.cursor.fetchall():
84         logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
85         punycode = row["domain"].encode("idna").decode("utf-8")
86
87         if row["nodeinfo_url"].startswith("/"):
88             logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
89             continue
90         elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
91             logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
92             cnt = cnt + 1
93
94     logger.info("Found %d row(s)", cnt)
95
96     logger.debug("EXIT!")
97     return 0
98
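check_nodeinfo() compares both the Unicode and the punycode spelling of a domain against nodeinfo_url, because an internationalized domain may appear in either form. A quick demonstration of the IDNA conversion it uses; note that Python's built-in "idna" codec implements IDNA 2003, and the stricter third-party 'idna' package would be needed for IDNA 2008 semantics:

# Demonstration of the IDNA round-trip used in check_nodeinfo().
domain = "bücher.example"
punycode = domain.encode("idna").decode("utf-8")
print(punycode)  # xn--bcher-kva.example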
99 def fetch_pixelfed_api(args: argparse.Namespace) -> int:
100     logger.debug("args[]='%s' - CALLED!", type(args))
101
102     # No CSRF by default; there is no need to add network.api_headers here
103     headers = dict()
104     api_domain = "pixelfed.org"
105
106     if apis.is_recent(api_domain):
107         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
108         return 0
109     else:
110         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
111         apis.update(api_domain)
112
113     try:
114         logger.debug("Checking CSRF from api_domain='%s' ...", api_domain)
115         headers = csrf.determine(api_domain, dict())
116     except network.exceptions as exception:
117         logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
118         return list()
119
120     try:
121         logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
122         fetched = network.get_json_api(
123             api_domain,
124             "/api/v1/servers/all.json?scope=All&country=all&language=all",
125             headers,
126             (config.get("connection_timeout"), config.get("read_timeout"))
127         )
128
129         logger.debug("JSON API returned %d elements", len(fetched))
130         if "error_message" in fetched:
131             logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
132             return 101
133         elif "data" not in fetched["json"]:
134             logger.warning("API did not return JSON with 'data' element - EXIT!")
135             return 102
136
137         rows = fetched["json"]["data"]
138         logger.info("Checking %d fetched rows ...", len(rows))
139         for row in rows:
140             logger.debug("row[]='%s'", type(row))
141             if "domain" not in row:
142                 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
143                 continue
144             elif row["domain"] == "":
145                 logger.debug("row[domain] is empty - SKIPPED!")
146                 continue
147             elif not utils.is_domain_wanted(row["domain"]):
148                 logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
149                 continue
150             elif instances.is_registered(row["domain"]):
151                 logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
152                 continue
153             elif instances.is_recent(row["domain"]):
154                 logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
155                 continue
156
157             logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
158             federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)
159
160     except network.exceptions as exception:
161         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
162         return 103
163
164     logger.debug("Success! - EXIT!")
165     return 0
166
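The "apis.is_recent()/apis.update()" guard that opens fetch_pixelfed_api() recurs at the top of most commands in this file. A hypothetical refactoring sketch (not part of fba) showing how it could be factored into a decorator; the name throttle_api is an assumption:

# Hypothetical sketch: factor the recurring recent-access guard into a
# decorator. apis.is_recent()/apis.update() are the helpers used above.
import functools

def throttle_api(api_domain: str):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(args: argparse.Namespace) -> int:
            if apis.is_recent(api_domain):
                logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
                return 0
            apis.update(api_domain)
            return func(args)
        return wrapper
    return decorator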
167 def fetch_bkali(args: argparse.Namespace) -> int:
168     logger.debug("args[]='%s' - CALLED!", type(args))
169
170     api_domain = "gql.apis.bka.li"
171     if apis.is_recent(api_domain):
172         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
173         return 0
174     else:
175         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
176         apis.update(api_domain)
177
178     domains = list()
179     try:
180         logger.info("Fetching domainlist from api_domain='%s' ...", api_domain)
181         fetched = network.post_json_api(
182             api_domain,
183             "/v1/graphql",
184             json.dumps({
185                 "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
186             })
187         )
188
189         logger.debug("fetched[]='%s'", type(fetched))
190         if "error_message" in fetched:
191             logger.warning("post_json_api() for 'gql.apis.bka.li' returned error message='%s", fetched["error_message"])
192             return 100
193         elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
194             logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])
195             return 101
196
197         rows = fetched["json"]
198
199         logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
200         if len(rows) == 0:
201             raise Exception("WARNING: Returned no records")
202         elif "data" not in rows:
203             raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
204         elif "nodeinfo" not in rows["data"]:
205             raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")
206
207         for entry in rows["data"]["nodeinfo"]:
208             logger.debug("entry[%s]='%s'", type(entry), entry)
209             if "domain" not in entry:
210                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
211                 continue
212             elif entry["domain"] == "":
213                 logger.debug("entry[domain] is empty - SKIPPED!")
214                 continue
215             elif not utils.is_domain_wanted(entry["domain"]):
216                 logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
217                 continue
218             elif instances.is_registered(entry["domain"]):
219                 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
220                 continue
221             elif instances.is_recent(entry["domain"]):
222                 logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
223                 continue
224
225             logger.debug("Adding domain='%s' ...", entry["domain"])
226             domains.append(entry["domain"])
227
228     except network.exceptions as exception:
229         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
230         return 102
231
232     logger.debug("domains()=%d", len(domains))
233     if len(domains) > 0:
234         locking.acquire()
235
236         logger.info("Adding %d new instances ...", len(domains))
237         for domain in domains:
238             try:
239                 logger.info("Fetching instances from domain='%s' ...", domain)
240                 federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
241             except network.exceptions as exception:
242                 logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
243                 instances.set_last_error(domain, exception)
244                 return 100
245
246     logger.debug("Success - EXIT!")
247     return 0
248
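A minimal sketch of the GraphQL request fetch_bkali() performs, shown with plain requests instead of the project's network.post_json_api() wrapper. The endpoint and query come from the code above; the timeout values and Content-Type header are assumptions:

# Sketch: the same domainlist query against gql.apis.bka.li, issued
# directly with requests.
import json
import requests

response = requests.post(
    "https://gql.apis.bka.li/v1/graphql",
    data=json.dumps({"query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"}),
    headers={"Content-Type": "application/json"},
    timeout=(5, 10),
)
for entry in response.json()["data"]["nodeinfo"]:
    print(entry["domain"])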
249 def fetch_blocks(args: argparse.Namespace) -> int:
250     logger.debug("args[]='%s' - CALLED!", type(args))
251     if args.domain is not None and args.domain != "":
252         logger.debug("args.domain='%s' - checking ...", args.domain)
253         if not validators.domain(args.domain):
254             logger.warning("args.domain='%s' is not valid.", args.domain)
255             return 100
256         elif blacklist.is_blacklisted(args.domain):
257             logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
258             return 101
259         elif not instances.is_registered(args.domain):
260             logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
261             return 102
262
263     locking.acquire()
264
265     if args.domain is not None and args.domain != "":
266         # Re-check single domain
267         logger.debug("Querying database for single args.domain='%s' ...", args.domain)
268         database.cursor.execute(
269             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
270         )
271     elif args.software is not None and args.software != "":
272         # Re-check single software
273         logger.debug("Querying database for args.software='%s' ...", args.software)
274         database.cursor.execute(
275             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
276         )
277     else:
278         # Re-check after "timeout" (aka. minimum interval)
279         database.cursor.execute(
280             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
281         )
282
283     rows = database.cursor.fetchall()
284     logger.info("Checking %d entries ...", len(rows))
285     for blocker, software, origin, nodeinfo_url in rows:
286         logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
287         blocker = tidyup.domain(blocker)
288         logger.debug("blocker='%s' - AFTER!", blocker)
289
290         if blocker == "":
291             logger.warning("blocker is now empty!")
292             continue
293         elif nodeinfo_url is None or nodeinfo_url == "":
294             logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
295             continue
296         elif not utils.is_domain_wanted(blocker):
297             logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
298             continue
299
300         logger.debug("blocker='%s'", blocker)
301         instances.set_last_blocked(blocker)
302         instances.set_has_obfuscation(blocker, False)
303
304         blocking = list()
305         blockdict = list()
306         if software == "pleroma":
307             logger.info("blocker='%s',software='%s'", blocker, software)
308             blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
309         elif software == "mastodon":
310             logger.info("blocker='%s',software='%s'", blocker, software)
311             blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
312         elif software == "lemmy":
313             logger.info("blocker='%s',software='%s'", blocker, software)
314             blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
315         elif software == "friendica":
316             logger.info("blocker='%s',software='%s'", blocker, software)
317             blocking = friendica.fetch_blocks(blocker)
318         elif software == "misskey":
319             logger.info("blocker='%s',software='%s'", blocker, software)
320             blocking = misskey.fetch_blocks(blocker)
321         else:
322             logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
323
324         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
325         instances.set_total_blocks(blocker, blocking)
326
327         logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
329         for block in blocking:
330             logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])
331
332             if block["block_level"] == "":
333                 logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
334                 continue
335
336             logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
337             block["blocked"] = tidyup.domain(block["blocked"])
338             block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
339             logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])
340
341             if block["blocked"] == "":
342                 logger.warning("blocked is empty, blocker='%s'", blocker)
343                 continue
344             elif block["blocked"].endswith(".onion"):
345                 logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
346                 continue
347             elif block["blocked"].endswith(".arpa"):
348                 logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
349                 continue
350             elif block["blocked"].endswith(".tld"):
351                 logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
352                 continue
353             elif block["blocked"].find("*") >= 0:
354                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
355
356                 # Some friendica servers also obscure domains without hash
357                 row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)
358
359                 logger.debug("row[]='%s'", type(row))
360                 if row is None:
361                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
362                     instances.set_has_obfuscation(blocker, True)
363                     continue
364
365                 block["blocked"] = row["domain"]
366                 origin           = row["origin"]
367                 nodeinfo_url     = row["nodeinfo_url"]
368             elif block["blocked"].find("?") >= 0:
369                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
370
371                 # Some instances obscure blocked domains with question marks; it is unclear whether this depends on the software version
372                 row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)
373
374                 logger.debug("row[]='%s'", type(row))
375                 if row is None:
376                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
377                     instances.set_has_obfuscation(blocker, True)
378                     continue
379
380                 block["blocked"] = row["domain"]
381                 origin           = row["origin"]
382                 nodeinfo_url     = row["nodeinfo_url"]
383
384             logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
385             if block["blocked"] == "":
386                 logger.debug("block[blocked] is empty - SKIPPED!")
387                 continue
388             elif not utils.is_domain_wanted(block["blocked"]):
389                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
390                 continue
391             elif block["block_level"] in ["accept", "accepted"]:
392                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
393                 continue
394             elif not instances.is_registered(block["blocked"]):
395                 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
396                 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
397
398             block["block_level"] = utils.alias_block_level(block["block_level"])
399
400             if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
401                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
402                 blockdict.append({
403                     "blocked": block["blocked"],
404                     "reason" : block["reason"],
405                 })
406
407             logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
408             cookies.clear(block["blocked"])
409
410         logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
411         if instances.has_pending(blocker):
412             logger.debug("Flushing updates for blocker='%s' ...", blocker)
413             instances.update_data(blocker)
414
415         logger.debug("Invoking commit() ...")
416         database.connection.commit()
417
418         logger.debug("Invoking cookies.clear(%s) ...", blocker)
419         cookies.clear(blocker)
420
421         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
422         if config.get("bot_enabled") and len(blockdict) > 0:
423             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
424             network.send_bot_post(blocker, blockdict)
425
426     logger.debug("Success! - EXIT!")
427     return 0
428
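The two deobfuscation branches above resolve censored entries such as "exam***.com" through instances.deobfuscate(). For Mastodon this is feasible because the public domain_blocks endpoint ships a SHA-256 digest of the full blocked domain alongside the censored name. A standalone sketch of that matching idea; known_domains is a stand-in assumption for the instances table:

# Sketch of hash-based deobfuscation: match a censored block entry
# against already-known domains via the SHA-256 digest Mastodon
# publishes next to it.
import hashlib

def deobfuscate_by_hash(digest: str, known_domains: list) -> str:
    for candidate in known_domains:
        if hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest:
            return candidate
    return None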
429 def fetch_observer(args: argparse.Namespace) -> int:
430     logger.debug("args[]='%s' - CALLED!", type(args))
431
432     api_domain = "fediverse.observer"
433     if apis.is_recent(api_domain):
434         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
435         return 0
436     else:
437         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
438         apis.update(api_domain)
439
440     # Acquire lock
441     locking.acquire()
442
443     types = list()
444     if args.software is None:
445         logger.info("Fetching software list ...")
446         raw = utils.fetch_url(
447             f"https://{api_domain}",
448             network.web_headers,
449             (config.get("connection_timeout"), config.get("read_timeout"))
450         ).text
451         logger.debug("raw[%s]()=%d", type(raw), len(raw))
452
453         doc = bs4.BeautifulSoup(raw, features="html.parser")
454         logger.debug("doc[]='%s'", type(doc))
455
456         items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
457         logger.debug("items[]='%s'", type(items))
458
459         logger.info("Checking %d menu items ...", len(items))
460         for item in items:
461             logger.debug("item[%s]='%s'", type(item), item)
462             if item.text.lower() == "all":
463                 logger.debug("Skipping 'All' menu entry ...")
464                 continue
465
466             logger.debug("Appending item.text='%s' ...", item.text)
467             types.append(tidyup.domain(item.text))
468     else:
469         logger.info("Adding args.software='%s' as type ...", args.software)
470         types.append(args.software)
471
472     logger.info("Fetching %d different table data ...", len(types))
473     for software in types:
474         logger.debug("software='%s' - BEFORE!", software)
475         if args.software is not None and args.software != software:
476             logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
477             continue
478
479         doc = None
480         try:
481             logger.debug("Fetching table data for software='%s' ...", software)
482             raw = utils.fetch_url(
483                 f"https://{api_domain}/app/views/tabledata.php?software={software}",
484                 network.web_headers,
485                 (config.get("connection_timeout"), config.get("read_timeout"))
486             ).text
487             logger.debug("raw[%s]()=%d", type(raw), len(raw))
488
489             doc = bs4.BeautifulSoup(raw, features="html.parser")
490             logger.debug("doc[]='%s'", type(doc))
491         except network.exceptions as exception:
492             logger.warning("Cannot fetch software='%s' from api_domain='%s': '%s'", software, api_domain, type(exception))
493             continue
494
495         items = doc.findAll("a", {"class": "url"})
496         logger.info("Checking %d items,software='%s' ...", len(items), software)
497         for item in items:
498             logger.debug("item[]='%s'", type(item))
499             domain = item.decode_contents()
500
501             logger.debug("domain='%s' - AFTER!", domain)
502             if domain == "":
503                 logger.debug("domain is empty - SKIPPED!")
504                 continue
505             elif not utils.is_domain_wanted(domain):
506                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
507                 continue
508             elif instances.is_registered(domain):
509                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
510                 continue
511             elif instances.is_recent(domain):
512                 logger.debug("domain='%s' is recently being handled - SKIPPED!", domain)
513                 continue
514
515             software = software_helper.alias(software)
516             logger.info("Fetching instances for domain='%s'", domain)
517             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
518
519     logger.debug("Success! - EXIT!")
520     return 0
521
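A standalone illustration of the second scraping step in fetch_observer(): domains sit in <a class="url"> elements of fediverse.observer's table data. The HTML snippet below is a made-up miniature of that structure:

# Sketch: extract domains from the observer's tabledata markup.
import bs4

html = '<a class="url" href="#">mastodon.example</a><a class="url" href="#">pleroma.example</a>'
doc = bs4.BeautifulSoup(html, features="html.parser")
print([item.decode_contents() for item in doc.findAll("a", {"class": "url"})])
# ['mastodon.example', 'pleroma.example']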
522 def fetch_todon_wiki(args: argparse.Namespace) -> int:
523     logger.debug("args[]='%s' - CALLED!", type(args))
524
525     api_domain = "wiki.todon.eu"
526     if apis.is_recent(api_domain):
527         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
528         return 0
529     else:
530         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
531         apis.update(api_domain)
532
533     locking.acquire()
534
535     blocklist = {
536         "silenced": list(),
537         "reject": list(),
538     }
539
540     raw = utils.fetch_url(f"https://{api_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
541     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
542
543     doc = bs4.BeautifulSoup(raw, "html.parser")
544     logger.debug("doc[]='%s'", type(doc))
545
546     silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
547     logger.info("Checking %d silenced/limited entries ...", len(silenced))
548     blocklist["silenced"] = utils.find_domains(silenced, "div")
549
550     suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
551     logger.info("Checking %d suspended entries ...", len(suspended))
552     blocklist["reject"] = utils.find_domains(suspended, "div")
553
554     blocking = blocklist["silenced"] + blocklist["reject"]
555     blocker = "todon.eu"
556
557     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
558     instances.set_total_blocks(blocker, blocking)
559
560     blockdict = list()
561     for block_level in blocklist:
562         blocked_domains = blocklist[block_level]
563
564         logger.debug("block_level='%s',blocked_domains()=%d", block_level, len(blocked_domains))
565         for blocked in blocked_domains:
566             logger.debug("blocked='%s'", blocked)
567
568             if not instances.is_registered(blocked):
569                 try:
570                     logger.info("Fetching instances from domain='%s' ...", blocked)
571                     federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
572                 except network.exceptions as exception:
573                     logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
574                     instances.set_last_error(blocked, exception)
575
576             if blocks.is_instance_blocked(blocker, blocked, block_level):
577                 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
578                 continue
579
580             logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
581             if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
582                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
583                 blockdict.append({
584                     "blocked": blocked,
585                     "reason" : None,
586                 })
587
588         logger.debug("Invoking commit() ...")
589         database.connection.commit()
590
591         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
592         if config.get("bot_enabled") and len(blockdict) > 0:
593             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
594             network.send_bot_post(blocker, blockdict)
595
596     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
597     if instances.has_pending(blocker):
598         logger.debug("Flushing updates for blocker='%s' ...", blocker)
599         instances.update_data(blocker)
600
601     logger.debug("Success! - EXIT!")
602     return 0
603
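fetch_todon_wiki() relies on a heading-to-list pattern: locate a heading by id, hop to the next <ul> and collect its <li> items. An illustration with a made-up miniature of the wiki page's markup:

# Sketch of the heading-to-list scraping pattern used above.
import bs4

html = '<h3 id="suspended_servers">Suspended servers</h3><ul><li><div>bad.example</div></li></ul>'
doc = bs4.BeautifulSoup(html, "html.parser")
items = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
print([item.find("div").text for item in items])  # ['bad.example']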
604 def fetch_cs(args: argparse.Namespace):
605     logger.debug("args[]='%s' - CALLED!", type(args))
606     extensions = [
607         "extra",
608         "abbr",
609         "attr_list",
610         "def_list",
611         "fenced_code",
612         "footnotes",
613         "md_in_html",
614         "admonition",
615         "codehilite",
616         "legacy_attrs",
617         "legacy_em",
618         "meta",
619         "nl2br",
620         "sane_lists",
621         "smarty",
622         "toc",
623         "wikilinks"
624     ]
625
626     domains = {
627         "silenced": list(),
628         "reject"  : list(),
629     }
630
631     api_domain = "raw.githubusercontent.com"
632     if apis.is_recent(api_domain):
633         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
634         return 0
635     else:
636         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
637         apis.update(api_domain)
638
639     raw = utils.fetch_url(f"https://{api_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
640     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
641
642     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
643     logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
644
645     silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
646     logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
647     domains["silenced"] = federation.find_domains(silenced)
648
649     blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
650     logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
651     domains["reject"] = federation.find_domains(blocked)
652
653     blocking = domains["silenced"] + domains["reject"]
654     blocker = "chaos.social"
655
656     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
657     instances.set_total_blocks(blocker, blocking)
658
659     logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"]))
660     blockdict = list()
661     if len(blocking) > 0:
662         locking.acquire()
663
664         for block_level in domains:
665             logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level]))
666
667             for row in domains[block_level]:
668                 logger.debug("row[%s]='%s'", type(row), row)
669                 if instances.is_recent(row["domain"], "last_blocked"):
670                     logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
671                     continue
672                 elif not instances.is_registered(row["domain"]):
673                     try:
674                         logger.info("Fetching instances from domain='%s' ...", row["domain"])
675                         federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
676                     except network.exceptions as exception:
677                         logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
678                         instances.set_last_error(row["domain"], exception)
679
680                 if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
681                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
682                     blockdict.append({
683                         "blocked": row["domain"],
684                         "reason" : row["reason"],
685                     })
686
687         logger.debug("Invoking commit() ...")
688         database.connection.commit()
689
690         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
691         if config.get("bot_enabled") and len(blockdict) > 0:
692             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
693             network.send_bot_post(blocker, blockdict)
694
695     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
696     if instances.has_pending(blocker):
697         logger.debug("Flushing updates for blocker='%s' ...", blocker)
698         instances.update_data(blocker)
699
700     logger.debug("Success! - EXIT!")
701     return 0
702
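fetch_cs() first renders the markdown blocklist to HTML and only then queries it with BeautifulSoup. A minimal demonstration of that pipeline; the table content is a made-up example:

# Sketch: render a markdown table to HTML, then query it with bs4.
import bs4
import markdown

md = "|Instance|Reason|\n|---|---|\n|bad.example|spam|\n"
html = markdown.markdown(md, extensions=["extra"])
doc = bs4.BeautifulSoup(html, features="html.parser")
print(doc.find("table").find("tbody").find("td").text)  # bad.example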
703 def fetch_fba_rss(args: argparse.Namespace) -> int:
704     logger.debug("args[]='%s' - CALLED!", type(args))
705     domains = list()
706
707     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
708     response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
709
710     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
711     if response.ok and response.status_code < 300 and len(response.text) > 0:
712         logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
713         rss = atoma.parse_rss_bytes(response.content)
714
715         logger.debug("rss[]='%s'", type(rss))
716         for item in rss.items:
717             logger.debug("item='%s'", item)
718             domain = tidyup.domain(item.link.split("=")[1])
719
720             logger.debug("domain='%s' - AFTER!", domain)
721             if domain == "":
722                 logger.debug("domain is empty - SKIPPED!")
723                 continue
724             elif not utils.is_domain_wanted(domain):
725                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
726                 continue
727             elif domain in domains:
728                 logger.debug("domain='%s' is already added - SKIPPED!", domain)
729                 continue
730             elif instances.is_registered(domain):
731                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
732                 continue
733             elif instances.is_recent(domain):
734                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
735                 continue
736
737             logger.debug("Adding domain='%s'", domain)
738             domains.append(domain)
739
740     logger.debug("domains()=%d", len(domains))
741     if len(domains) > 0:
742         locking.acquire()
743
744         logger.info("Adding %d new instances ...", len(domains))
745         for domain in domains:
746             try:
747                 logger.info("Fetching instances from domain='%s' ...", domain)
748                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
749             except network.exceptions as exception:
750                 logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
751                 instances.set_last_error(domain, exception)
752                 return 100
753
754     logger.debug("Success! - EXIT!")
755     return 0
756
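A sketch of the atoma step above, fed a tiny in-memory RSS document instead of a live feed; the item link mimics the "?domain=..." style that the code splits on "=":

# Sketch: parse a minimal RSS document with atoma and recover the
# domain from the item link, as fetch_fba_rss() does.
import atoma

raw = (b'<?xml version="1.0"?><rss version="2.0"><channel>'
       b'<title>fba</title><link>https://fba.example/</link>'
       b'<description>blocks</description>'
       b'<item><link>https://fba.example/?domain=bad.example</link></item>'
       b'</channel></rss>')
rss = atoma.parse_rss_bytes(raw)
print(rss.items[0].link.split("=")[1])  # bad.example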
757 def fetch_fbabot_atom(args: argparse.Namespace) -> int:
758     logger.debug("args[]='%s' - CALLED!", type(args))
759
760     api_domain = "ryana.agency"
761     if apis.is_recent(api_domain):
762         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
763         return 0
764     else:
765         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
766         apis.update(api_domain)
767
768     feed = f"https://{api_domain}/users/fba/feed.atom"
769
770     domains = list()
771
772     logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
773     response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
774
775     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
776     if response.ok and response.status_code < 300 and len(response.text) > 0:
777         logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
778         atom = atoma.parse_atom_bytes(response.content)
779
780         logger.debug("atom[]='%s'", type(atom))
781         for entry in atom.entries:
782             logger.debug("entry[]='%s'", type(entry))
783             doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
784             logger.debug("doc[]='%s'", type(doc))
785             for element in doc.findAll("a"):
786                 logger.debug("element[]='%s'", type(element))
787                 for href in element["href"].split(","):
788                     logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
789                     domain = tidyup.domain(href)
790
791                     logger.debug("domain='%s' - AFTER!", domain)
792                     if domain == "":
793                         logger.debug("domain is empty - SKIPPED!")
794                         continue
795                     elif not utils.is_domain_wanted(domain):
796                         logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
797                         continue
798                     elif domain in domains:
799                         logger.debug("domain='%s' is already added - SKIPPED!", domain)
800                         continue
801                     elif instances.is_registered(domain):
802                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
803                         continue
804                     elif instances.is_recent(domain):
805                         logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
806                         continue
807
808                     logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
809                     domains.append(domain)
810
811     logger.debug("domains()=%d", len(domains))
812     if len(domains) > 0:
813         locking.acquire()
814
815         logger.info("Adding %d new instances ...", len(domains))
816         for domain in domains:
817             logger.debug("domain='%s'", domain)
818             try:
819                 logger.info("Fetching instances from domain='%s' ...", domain)
820                 federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name)
821             except network.exceptions as exception:
822                 logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
823                 instances.set_last_error(domain, exception)
824                 return 100
825
826     logger.debug("Success! - EXIT!")
827     return 0
828
829 def fetch_instances(args: argparse.Namespace) -> int:
830     logger.debug("args[]='%s' - CALLED!", type(args))
831
832     locking.acquire()
833
834     # Initial fetch
835     try:
836         logger.info("Fetching instances from args.domain='%s' ...", args.domain)
837         federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
838     except network.exceptions as exception:
839         logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
840         instances.set_last_error(args.domain, exception)
841         instances.update_data(args.domain)
842         return 100
843
844     if args.single:
845         logger.debug("Not fetching more instances - EXIT!")
846         return 0
847
848     # Loop through some instances
849     database.cursor.execute(
850         "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
851     )
852
853     rows = database.cursor.fetchall()
854     logger.info("Checking %d entries ...", len(rows))
855     for row in rows:
856         logger.debug("row[domain]='%s'", row["domain"])
857         if row["domain"] == "":
858             logger.debug("row[domain] is empty - SKIPPED!")
859             continue
860         elif not utils.is_domain_wanted(row["domain"]):
861             logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
862             continue
863
864         try:
865             logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
866             federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
867         except network.exceptions as exception:
868             logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
869             instances.set_last_error(row["domain"], exception)
870
871     logger.debug("Success - EXIT!")
872     return 0
873
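The SELECT above implements the "recheck after timeout" pattern: rows qualify when they have never been fetched or their last fetch predates the configured interval. A standalone sketch against an in-memory SQLite table; recheck_instance stands in for the configured interval in seconds:

# Sketch of the recheck-after-timeout selection used above.
import sqlite3
import time

connection = sqlite3.connect(":memory:")
connection.execute("CREATE TABLE instances (domain TEXT, last_instance_fetch FLOAT)")
connection.execute("INSERT INTO instances VALUES ('stale.example', ?)", [time.time() - 7200])
connection.execute("INSERT INTO instances VALUES ('fresh.example', ?)", [time.time()])

recheck_instance = 3600
rows = connection.execute(
    "SELECT domain FROM instances WHERE last_instance_fetch IS NULL OR last_instance_fetch < ?",
    [time.time() - recheck_instance]
).fetchall()
print(rows)  # [('stale.example',)]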
874 def fetch_oliphant(args: argparse.Namespace) -> int:
875     logger.debug("args[]='%s' - CALLED!", type(args))
876
877     api_domain = "codeberg.org"
878     if apis.is_recent(api_domain):
879         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
880         return 0
881     else:
882         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
883         apis.update(api_domain)
884
885     locking.acquire()
886
887     # Base URL
888     base_url = f"https://{api_domain}/oliphant/blocklists/raw/branch/main/blocklists"
889
890     # URLs to fetch
891     blocklists = (
892         {
893             "blocker": "artisan.chat",
894             "csv_url": "mastodon/artisan.chat.csv",
895         },{
896             "blocker": "mastodon.art",
897             "csv_url": "mastodon/mastodon.art.csv",
898         },{
899             "blocker": "pleroma.envs.net",
900             "csv_url": "mastodon/pleroma.envs.net.csv",
901         },{
902             "blocker": "oliphant.social",
903             "csv_url": "mastodon/_unified_tier3_blocklist.csv",
904         },{
905             "blocker": "mastodon.online",
906             "csv_url": "mastodon/mastodon.online.csv",
907         },{
908             "blocker": "mastodon.social",
909             "csv_url": "mastodon/mastodon.social.csv",
910         },{
911             "blocker": "mastodon.social",
912             "csv_url": "other/missing-tier0-mastodon.social.csv",
913         },{
914             "blocker": "rage.love",
915             "csv_url": "mastodon/rage.love.csv",
916         },{
917             "blocker": "sunny.garden",
918             "csv_url": "mastodon/sunny.garden.csv",
919         },{
920             "blocker": "solarpunk.moe",
921             "csv_url": "mastodon/solarpunk.moe.csv",
922         },{
923             "blocker": "toot.wales",
924             "csv_url": "mastodon/toot.wales.csv",
925         },{
926             "blocker": "union.place",
927             "csv_url": "mastodon/union.place.csv",
928         }
929     )
930
931     domains = list()
932
933     logger.debug("Downloading %d files ...", len(blocklists))
934     for block in blocklists:
935         # Is a domain given that does not match this blocker?
936         if isinstance(args.domain, str) and args.domain != block["blocker"]:
937             logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
938             continue
939         elif args.domain in domains:
940             logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
941             continue
942         elif instances.is_recent(block["blocker"]):
943             logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
944             continue
945
946         # Fetch this URL
947         logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
948         response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
949
950         logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
951         if not response.ok or response.status_code >= 300 or len(response.content) == 0:
952             logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
953             continue
954
955         logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
956         reader = list(csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix"))
957
958         blockdict = list()
959
960         logger.info("Processing %d rows ...", len(reader))
961         cnt = 0
962         for row in reader:
963             logger.debug("row[%s]='%s'", type(row), row)
964             domain = severity = None
965             reject_media = reject_reports = False
966
967             if "#domain" in row:
968                 domain = row["#domain"]
969             elif "domain" in row:
970                 domain = row["domain"]
971             else:
972                 logger.debug("row='%s' does not contain domain column", row)
973                 continue
974
975             if "#severity" in row:
976                 severity = row["#severity"]
977             elif "severity" in row:
978                 severity = row["severity"]
979             else:
980                 logger.debug("row='%s' does not contain severity column", row)
981                 continue
982
983             if "#reject_media" in row and row["#reject_media"].lower() == "true":
984                 reject_media = True
985             elif "reject_media" in row and row["reject_media"].lower() == "true":
986                 reject_media = True
987
988             if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
989                 reject_reports = True
990             elif "reject_reports" in row and row["reject_reports"].lower() == "true":
991                 reject_reports = True
992
993             cnt = cnt + 1
994             logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
995             if domain == "":
996                 logger.debug("domain is empty - SKIPPED!")
997                 continue
998             elif not utils.is_domain_wanted(domain):
999                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1000                 continue
1001
1002             logger.debug("Marking domain='%s' as handled", domain)
1003             domains.append(domain)
1004
1005             logger.debug("Processing domain='%s' ...", domain)
1006             processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1007             logger.debug("processed='%s'", processed)
1008
1009             if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
1010                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1011                 blockdict.append({
1012                     "blocked": domain,
1013                     "reason" : block["reason"],
1014                 })
1015
1016             if reject_media:
1017                 utils.process_block(block["blocker"], domain, None, "reject_media")
1018             if reject_reports:
1019                 utils.process_block(block["blocker"], domain, None, "reject_reports")
1020
1021         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", block["blocker"], cnt)
1022         instances.set_total_blocks(block["blocker"], cnt)
1023
1024         logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1025         if instances.has_pending(block["blocker"]):
1026             logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1027             instances.update_data(block["blocker"])
1028
1029         logger.debug("Invoking commit() ...")
1030         database.connection.commit()
1031
1032         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1033         if config.get("bot_enabled") and len(blockdict) > 0:
1034             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1035             network.send_bot_post(block["blocker"], blockdict)
1036
1037     logger.debug("Success! - EXIT!")
1038     return 0
1039
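An illustration of the CSV quirk fetch_oliphant() handles: the blocklist files label their columns either 'domain' or '#domain' (and likewise for severity), so both spellings are probed. A tiny normalizer can strip the optional '#' up front; the sample lines are made up:

# Sketch: normalize '#'-prefixed CSV headers before lookup.
import csv

lines = ["#domain,#severity", "bad.example,suspend"]
for row in csv.DictReader(lines, dialect="unix"):
    normalized = {key.lstrip("#"): value for key, value in row.items()}
    print(normalized["domain"], normalized["severity"])  # bad.example suspend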
1040 def fetch_txt(args: argparse.Namespace) -> int:
1041     logger.debug("args[]='%s' - CALLED!", type(args))
1042
1043     locking.acquire()
1044
1045     # Static URLs
1046     urls = ({
1047         "blocker": "seirdy.one",
1048         "url"    : "https://seirdy.one/pb/bsl.txt",
1049     },)
1050
1051     logger.info("Checking %d text file(s) ...", len(urls))
1052     for row in urls:
1053         logger.debug("Fetching row[url]='%s' ...", row["url"])
1054         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1055
1056         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1057         if response.ok and response.status_code < 300 and response.text != "":
1058             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1059             domains = response.text.split("\n")
1060
1061             logger.info("Processing %d domains ...", len(domains))
1062             for domain in domains:
1063                 logger.debug("domain='%s' - BEFORE!", domain)
1064                 domain = tidyup.domain(domain)
1065
1066                 logger.debug("domain='%s' - AFTER!", domain)
1067                 if domain == "":
1068                     logger.debug("domain is empty - SKIPPED!")
1069                     continue
1070                 elif not utils.is_domain_wanted(domain):
1071                     logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1072                     continue
1073                 elif instances.is_recent(domain):
1074                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1075                     continue
1076
1077                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1078                 processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1079
1080                 logger.debug("processed='%s'", processed)
1081                 if not processed:
1082                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1083                     continue
1084
1085     logger.debug("Success! - EXIT!")
1086     return 0
1087
1088 def fetch_fedipact(args: argparse.Namespace) -> int:
1089     logger.debug("args[]='%s' - CALLED!", type(args))
1090
1091     api_domain = "fedipact.online"
1092     if apis.is_recent(api_domain):
1093         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
1094         return 0
1095     else:
1096         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
1097         apis.update(api_domain)
1098
1099     locking.acquire()
1100
1101     response = utils.fetch_url(
1102         f"https://{api_domain}",
1103         network.web_headers,
1104         (config.get("connection_timeout"), config.get("read_timeout"))
1105     )
1106
1107     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1108     if response.ok and response.status_code < 300 and response.text != "":
1109         logger.debug("Parsing %d Bytes ...", len(response.text))
1110
1111         doc = bs4.BeautifulSoup(response.text, "html.parser")
1112         logger.debug("doc[]='%s'", type(doc))
1113
1114         rows = doc.findAll("li")
1115         logger.info("Checking %d row(s) ...", len(rows))
1116         for row in rows:
1117             logger.debug("row[]='%s'", type(row))
1118             domain = tidyup.domain(row.contents[0])
1119
1120             logger.debug("domain='%s' - AFTER!", domain)
1121             if domain == "":
1122                 logger.debug("domain is empty - SKIPPED!")
1123                 continue
1124             elif not utils.is_domain_wanted(domain):
1125                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1126                 continue
1127             elif instances.is_registered(domain):
1128                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1129                 continue
1130             elif instances.is_recent(domain):
1131                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1132                 continue
1133
1134             logger.info("Fetching domain='%s' ...", domain)
1135             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1136
1137     logger.debug("Success! - EXIT!")
1138     return 0
1139
1140 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1141     logger.debug("args[]='%s' - CALLED!", type(args))
1142
1143     api_domain = "joinfediverse.wiki"
1144     if apis.is_recent(api_domain):
1145         logger.info("API from api_domain='%s' has recently being accessed - EXIT!", api_domain)
1146         return 0
1147     else:
1148         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
1149         apis.update(api_domain)
1150
1151     locking.acquire()
1152
1153     raw = utils.fetch_url(
1154         f"https://{api_domain}/FediBlock",
1155         network.web_headers,
1156         (config.get("connection_timeout"), config.get("read_timeout"))
1157     ).text
1158     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1159
1160     doc = bs4.BeautifulSoup(raw, "html.parser")
1161     logger.debug("doc[]='%s'", type(doc))
1162
1163     tables = doc.findAll("table", {"class": "wikitable"})
1164
1165     logger.info("Analyzing %d table(s) ...", len(tables))
1166     blocklist = list()
1167     for table in tables:
1168         logger.debug("table[]='%s'", type(table))
1169
1170         rows = table.findAll("tr")
1171         logger.info("Checking %d row(s) ...", len(rows))
1172         block_headers = dict()
1173         for row in rows:
1174             logger.debug("row[%s]='%s'", type(row), row)
1175
            headers = row.findAll("th")
            logger.debug("Found headers()=%d header(s)", len(headers))
            if len(headers) > 1:
                block_headers = dict()
                cnt = 0
                for header in headers:
                    cnt = cnt + 1
                    logger.debug("header[]='%s',cnt=%d", type(header), cnt)
                    text = header.contents[0]

                    logger.debug("text[]='%s'", type(text))
                    if not isinstance(text, str):
                        logger.debug("text[]='%s' is not 'str' - SKIPPED!", type(text))
                        continue
                    elif validators.domain(text.strip()):
                        logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
                        continue

                    text = tidyup.domain(text.strip())
                    logger.debug("text='%s'", text)
                    if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
                        logger.debug("Found header: '%s'=%d", text, cnt)
                        block_headers[cnt] = text

            elif len(block_headers) == 0:
                logger.debug("row is not scrapable - SKIPPED!")
                continue
            else:
                logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
                cnt = 0
                block = dict()

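                # Walk the data cells in column order; only cells whose column
                # index appears in block_headers carry wanted values, everything
                # else is ignored.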
                for element in row.find_all(["th", "td"]):
                    cnt = cnt + 1
                    logger.debug("element[]='%s',cnt=%d", type(element), cnt)
                    if cnt in block_headers:
                        logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])

                        text = element.text.strip()
                        key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"

                        logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
                        if key == "blocked":
                            block[key] = text
                        elif key == "block reason(s)":
                            block[key] = tidyup.reason(text)
                        elif key == "subdomain(s)":
                            block[key] = list()
                            if text != "":
                                block[key] = text.split("/")
                        else:
                            logger.debug("key='%s'", key)
                            block[key] = text

                logger.debug("block()=%d ...", len(block))
                if len(block) > 0:
                    logger.debug("Appending block()=%d ...", len(block))
                    blocklist.append(block)

    logger.debug("blocklist()=%d", len(blocklist))

    database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
    domains = database.cursor.fetchall()

    logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
    blocking = list()
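    # Expand "subdomain(s)" entries into one block record per subdomain. Each
    # expanded record gets its own dict copy so the list entries do not all end
    # up pointing at the same mutated object.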
    for block in blocklist:
        logger.debug("block='%s'", block)
        if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
            origin = block["blocked"]
            for subdomain in block["subdomain(s)"]:
                subdomain_block = dict(block)
                subdomain_block["blocked"] = subdomain + "." + origin
                blocking.append(subdomain_block)
        else:
            blocking.append(block)

    logger.debug("blocking()=%d", len(blocking))
    for block in blocking:
        logger.debug("block[]='%s'", type(block))
        block["blocked"] = tidyup.domain(block["blocked"])

        logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
        if block["blocked"] == "":
            logger.debug("block[blocked] is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(block["blocked"]):
            logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            continue
        elif instances.is_recent(block["blocked"]):
            logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
            continue

        logger.info("Processing blocked='%s' ...", block["blocked"])
        utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)

    blockdict = list()
    for blocker in domains:
        blocker = blocker[0]
        logger.debug("blocker[%s]='%s'", type(blocker), blocker)

        for block in blocking:
            logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block.get("reason"))
            block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None

            logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
            if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def recheck_obfuscation(args: argparse.Namespace) -> int:
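    """
    Re-fetches block lists from instances flagged with has_obfuscation and tries
    to deobfuscate wildcarded entries; clears the flag once a list resolves
    completely.
    """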
    logger.debug("args[]='%s' - CALLED!", type(args))

    locking.acquire()

    if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
    elif isinstance(args.software, str) and args.software != "":
        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
    else:
        database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")

    rows = database.cursor.fetchall()
    logger.info("Checking %d domains ...", len(rows))
    for row in rows:
        logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
            logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
            continue

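        # Each supported software type has its own block-list fetcher; anything
        # else is logged and leaves the block list empty.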
        blocking = list()
        if row["software"] == "pleroma":
            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
            blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
        elif row["software"] == "mastodon":
            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
            blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
        elif row["software"] == "lemmy":
            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
            blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
        elif row["software"] == "friendica":
            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
            blocking = friendica.fetch_blocks(row["domain"])
        elif row["software"] == "misskey":
            logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
            blocking = misskey.fetch_blocks(row["domain"])
        else:
            logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
        instances.set_total_blocks(row["domain"], blocking)

        logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
        obfuscated = 0
        blockdict = list()
        for block in blocking:
            logger.debug("block[blocked]='%s'", block["blocked"])
            blocked = None

            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
                logger.debug("blocked='%s' is obfuscated.", block["blocked"])
                obfuscated = obfuscated + 1
                blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
                logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
                continue

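            # If deobfuscation produced a real domain, record it as a regular
            # block and take it out of the obfuscated count again.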
            logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
            if blocked is not None and blocked != block["blocked"]:
                logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
                obfuscated = obfuscated - 1
                if blocks.is_instance_blocked(row["domain"], blocked):
                    logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
                    continue

                block["block_level"] = utils.alias_block_level(block["block_level"])

                logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
                if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
                    blockdict.append({
                        "blocked": blocked,
                        "reason" : block["reason"],
                    })

        logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
        if obfuscated == 0 and len(blocking) > 0:
            logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
            instances.set_has_obfuscation(row["domain"], False)

        if instances.has_pending(row["domain"]):
            logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
            instances.update_data(row["domain"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
            network.send_bot_post(row["domain"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedilist(args: argparse.Namespace) -> int:
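    """
    Fetches the instance list from demo.fedilist.com as CSV, optionally filtered
    by software type, and feeds new domains into the crawler.
    """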
    logger.debug("args[]='%s' - CALLED!", type(args))

    api_domain = "demo.fedilist.com"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    url = f"http://{api_domain}/instance/csv?onion=not"
    if args.software is not None and args.software != "":
        logger.debug("args.software='%s'", args.software)
        url = f"http://{api_domain}/instance/csv?software={args.software}&onion=not"

    locking.acquire()

    logger.info("Fetching url='%s' ...", url)
    response = reqto.get(
        url,
        headers=network.web_headers,
        timeout=(config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=False
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if not response.ok or response.status_code >= 300 or len(response.content) == 0:
        logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", response.ok, response.status_code, len(response.content))
        return 1

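    # The CSV is parsed with the "unix" dialect; each row is expected to carry
    # at least a "hostname" column.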
    reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

    logger.debug("reader[]='%s'", type(reader))
    blockdict = list()
    for row in reader:
        logger.debug("row[]='%s'", type(row))
        domain = tidyup.domain(row["hostname"])
        logger.debug("domain='%s' - AFTER!", domain)

        if domain == "":
            logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
            continue
        elif not utils.is_domain_wanted(domain):
            logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif (args.all is None or not args.all) and instances.is_registered(domain):
            logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s' - SKIPPED!", domain, type(args.all))
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
            continue

        logger.info("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def update_nodeinfo(args: argparse.Namespace) -> int:
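    """
    Re-checks the nodeinfo of known instances, either for one domain, for one
    software type or for all entries whose last check is older than the
    configured "recheck_nodeinfo" interval.
    """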
    logger.debug("args[]='%s' - CALLED!", type(args))

    locking.acquire()

    if args.domain is not None and args.domain != "":
        logger.debug("Fetching args.domain='%s'", args.domain)
        database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
    elif args.software is not None and args.software != "":
        logger.info("Fetching domains for args.software='%s'", args.software)
        database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
    else:
        logger.info("Fetching domains with outdated nodeinfo ...")
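        # Staleness cut-off: anything last checked before now minus the
        # "recheck_nodeinfo" interval (in seconds, matching time.time()) is due.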
        database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])

    domains = database.cursor.fetchall()

    logger.info("Checking %d domain(s) ...", len(domains))
    cnt = 0
    for row in domains:
        logger.debug("row[]='%s'", type(row))
        try:
            logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
            software = federation.determine_software(row["domain"])

            logger.debug("Determined software='%s'", software)
            if software != row["software"]:
                logger.warning("Software type has changed from '%s' to '%s'!", row["software"], software)
                instances.set_software(row["domain"], software)

            instances.set_success(row["domain"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

        instances.set_last_nodeinfo(row["domain"])
        instances.update_data(row["domain"])
        cnt = cnt + 1

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances_social(args: argparse.Namespace) -> int:
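    """
    Fetches the instance list from the instances.social API (requires an API
    key in config.json) and feeds new domains into the crawler.
    """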
    logger.debug("args[]='%s' - CALLED!", type(args))

    api_domain = "instances.social"

    if config.get("instances_social_api_key") == "":
        logger.error("API key not set. Please set in your config.json file.")
        return 1
    elif apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    locking.acquire()
    headers = {
        "Authorization": f"Bearer {config.get('instances_social_api_key')}",
    }

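    # count=0 presumably requests the full, unpaginated instance list; sorting
    # by name keeps the processing order deterministic between runs.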
    fetched = network.get_json_api(
        api_domain,
        "/api/1.0/instances/list?count=0&sort_by=name",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )
    logger.debug("fetched[]='%s'", type(fetched))

    if "error_message" in fetched:
        logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
        return 2
    elif "exception" in fetched:
        logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
        return 3
    elif "json" not in fetched:
        logger.warning("fetched has no element 'json' - EXIT!")
        return 4
    elif "instances" not in fetched["json"]:
        logger.warning("fetched[json] has no element 'instances' - EXIT!")
        return 5

    domains = list()
    rows = fetched["json"]["instances"]

    logger.info("Checking %d row(s) ...", len(rows))
    for row in rows:
        logger.debug("row[]='%s'", type(row))
        domain = tidyup.domain(row["name"])

        logger.debug("domain='%s' - AFTER!", domain)
        if domain == "":
            logger.debug("domain is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(domain):
            logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif domain in domains:
            logger.debug("domain='%s' is already added - SKIPPED!", domain)
            continue
        elif instances.is_registered(domain):
            logger.debug("domain='%s' is already registered - SKIPPED!", domain)
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
            continue

        domains.append(domain)
        logger.info("Fetching instances from domain='%s'", domain)
        federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0