# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

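"""Command handlers for the FBA (Fedi API Block) aggregator.

Each function implements one CLI sub-command and returns a numeric status
code: 0 on success, 100 and above on errors.
"""
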
import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import apis
from fba.models import blocks
from fba.models import instances

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
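    """Checks whether args.domain is valid, not blacklisted and not yet
    registered. Returns 0 when the domain is unknown, 100-102 otherwise."""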
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
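    """Reports instances whose stored nodeinfo URL mentions neither their
    domain nor its punycode form. Always returns 0."""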
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
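    """Fetches the server list from the pixelfed.org API and crawls any new,
    wanted instances found there."""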
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default; there is no need to add network.api_headers here
    headers = tuple()
    api_domain = "pixelfed.org"

    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    try:
        logger.debug("Checking CSRF from api_domain='%s' ...", api_domain)
        headers = csrf.determine(api_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            api_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
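    """Fetches the domain list from the gql.apis.bka.li GraphQL API and
    crawls any new, wanted instances."""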
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "gql.apis.bka.li"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from api_domain='%s' ...", api_domain)
        fetched = network.post_json_api(
            api_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.apis.bka.li' returned error_message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
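    """Fetches block lists from all supported instances, or from a single
    domain/software when given via args, and records the blocks found."""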
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        blockdict = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(block["blocked"]):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = utils.alias_block_level(block["block_level"])

            if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
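    """Scrapes fediverse.observer, either for all software types listed in
    its menu or only for args.software, and crawls new domains."""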
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "fediverse.observer"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{api_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{api_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from api_domain='%s': '%s'", software, api_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
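    """Scrapes the silenced/suspended server lists from wiki.todon.eu and
    records them as blocks by todon.eu."""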
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "wiki.todon.eu"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url(f"https://{api_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

    # Commit and notify once, after both block levels have been processed
    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
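    """Fetches chaos.social's federation.md from raw.githubusercontent.com
    and records its silenced/blocked instances as blocks."""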
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    domains = {
        "silenced": list(),
        "reject"  : list(),
    }

    api_domain = "raw.githubusercontent.com"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    raw = utils.fetch_url(f"https://{api_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    domains["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    domains["reject"] = federation.find_domains(blocked)

    blocking = domains["silenced"] + domains["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"]))
    blockdict = list()
    if len(domains) > 0:
        for block_level in domains:
            logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level]))

            for row in domains[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
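    """Fetches an FBA-specific RSS feed given as args.feed and crawls all
    new, wanted domains found in its items."""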
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if apis.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        apis.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
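    """Fetches the Atom feed of the FBA bot account on ryana.agency and
    crawls all new, wanted domains linked from its entries."""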
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "ryana.agency"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    feed = f"https://{api_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue
                    elif not utils.is_domain_wanted(domain):
                        logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
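    """Fetches instances from args.domain and, unless args.single is set,
    from all known instances that are due for a re-check."""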
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue
        elif not utils.is_domain_wanted(row["domain"]):
            logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
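    """Downloads the oliphant blocklist CSV files from codeberg.org and
    records the blocks they contain."""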
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "codeberg.org"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    # Base URL
    base_url = f"https://{api_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        # csv.DictReader is a one-shot iterator without len(); materialize it
        # so the row count can be logged
        rows = list(csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix"))

        blockdict = list()

        logger.info("Processing %d rows ...", len(rows))
        cnt = 0
        for row in rows:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = row["#severity"]
            elif "severity" in row:
                severity = row["severity"]
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s' for blocker='%s' ...", domain, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                utils.process_block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                utils.process_block(block["blocker"], domain, None, "reject_reports")

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", block["blocker"], cnt)
        instances.set_total_blocks(block["blocker"], cnt)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
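    """Fetches plain-text blocklists (currently seirdy.one's bsl.txt) and
    processes each listed domain."""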
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain)

                logger.debug("domain='%s' - AFTER!", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
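    """Scrapes the participant list from fedipact.online and crawls all new,
    wanted domains."""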
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    api_domain = "fedipact.online"
    if apis.is_recent(api_domain):
        logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
        return 0
    else:
        logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
        apis.update(api_domain)

    response = utils.fetch_url(
        f"https://{api_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinfediverse(args: argparse.Namespace) -> int:
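    """Scrapes the FediBlock tables on joinfediverse.wiki and collects the
    blocked domains listed there."""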
1171     logger.debug("args[]='%s' - CALLED!", type(args))
1172
1173     logger.debug("Invoking locking.acquire() ...")
1174     locking.acquire()
1175
1176     api_domain = "joinfediverse.wiki"
1177     if apis.is_recent(api_domain):
1178         logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
1179         return 0
1180     else:
1181         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
1182         apis.update(api_domain)
1183
1184     raw = utils.fetch_url(
1185         f"https://{api_domain}/FediBlock",
1186         network.web_headers,
1187         (config.get("connection_timeout"), config.get("read_timeout"))
1188     ).text
1189     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1190
1191     doc = bs4.BeautifulSoup(raw, "html.parser")
1192     logger.debug("doc[]='%s'", type(doc))
1193
1194     tables = doc.findAll("table", {"class": "wikitable"})
1195
1196     logger.info("Analyzing %d table(s) ...", len(tables))
1197     blocklist = list()
1198     for table in tables:
1199         logger.debug("table[]='%s'", type(table))
1200
1201         rows = table.findAll("tr")
1202         logger.info("Checking %d row(s) ...", len(rows))
1203         block_headers = dict()
1204         for row in rows:
1205             logger.debug("row[%s]='%s'", type(row), row)
1206
1207             headers = row.findAll("th")
1208             logger.debug("Found headers()=%d header(s)", len(headers))
1209             if len(headers) > 1:
1210                 block_headers = dict()
1211                 cnt = 0
1212                 for header in headers:
1213                     cnt = cnt + 1
1214                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1215                     text = header.contents[0]
1216
1217                     logger.debug("text[]='%s'", type(text))
1218                     if not isinstance(text, str):
1219                         logger.debug("text[]='%s' is not 'str' - SKIPPED!", type(text))
1220                         continue
1221                     elif validators.domain(text.strip()):
1222                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1223                         continue
1224
1225                     text = tidyup.domain(text.strip())
1226                     logger.debug("text='%s'", text)
1227                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1228                         logger.debug("Found header: '%s'=%d", text, cnt)
1229                         block_headers[cnt] = text
1230
1231             elif len(block_headers) == 0:
1232                 logger.debug("row is not scrapable - SKIPPED!")
1233                 continue
1234             elif len(block_headers) > 0:
1235                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1236                 cnt = 0
1237                 block = dict()
1238
1239                 for element in row.find_all(["th", "td"]):
1240                     cnt = cnt + 1
1241                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1242                     if cnt in block_headers:
1243                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1244
1245                         text = element.text.strip()
1246                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1247
1248                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1249                         if key == "blocked":  # "domain"/"instance" headers are mapped to "blocked" above
1250                             block[key] = text
1251                         elif key == "block reason(s)":
1252                             block[key] = tidyup.reason(text)
1253                         elif key == "subdomain(s)":
1254                             block[key] = list()
1255                             if text != "":
1256                                 block[key] = text.split("/")
1257                         else:
1258                             logger.debug("key='%s'", key)
1259                             block[key] = text
1260
1261                 logger.debug("block()=%d ...", len(block))
1262                 if len(block) > 0:
1263                     logger.debug("Appending block()=%d ...", len(block))
1264                     blocklist.append(block)
1265
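    # Each appended block is a plain dict keyed by the scraped headers; the
    # exact keys and column order depend on the wiki table, but roughly:
    #     {"blocked": "example.com",
    #      "subdomain(s)": ["mastodon", "pleroma"],
    #      "block reason(s)": "spam"}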
1266     logger.debug("blocklist()=%d", len(blocklist))
1267
1268     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1269     domains = database.cursor.fetchall()
1270
1271     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1272     blocking = list()
1273     for block in blocklist:
1274         logger.debug("block='%s'", block)
1275         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1276             origin = block["blocked"]
1277             for subdomain in block["subdomain(s)"]:
1278                 # copy the dict so each subdomain gets its own entry
1279                 blocking.append({**block, "blocked": subdomain + "." + origin})
1280         else:
1281             blocking.append(block)
1282
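    # Example of the expansion above: a block {"blocked": "example.com",
    # "subdomain(s)": ["a", "b"]} yields two entries, one blocking
    # "a.example.com" and one blocking "b.example.com".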
1283     logger.debug("blocking()=%d", len(blocking))
1284     for block in blocking:
1285         logger.debug("block[]='%s'", type(block))
1286         block["blocked"] = tidyup.domain(block["blocked"])
1287
1288         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1289         if block["blocked"] == "":
1290             logger.debug("block[blocked] is empty - SKIPPED!")
1291             continue
1292         elif not utils.is_domain_wanted(block["blocked"]):
1293             logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1294             continue
1295         elif instances.is_recent(block["blocked"]):
1296             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
1297             continue
1298
1299         logger.info("Processing blocked='%s' ...", block["blocked"])
1300         utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1301
1302     blockdict = list()
1303     for blocker in domains:
1304         blocker = blocker[0]
1305         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1306
1307         for block in blocking:
1308             logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block.get("reason"))
1309             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1310
1311             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1312             if block["blocked"] == "":
1313                 logger.debug("block[blocked] is empty - SKIPPED!")
1314                 continue
1315             elif not utils.is_domain_wanted(block["blocked"]):
1316                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1317                 continue
1318
1319             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1320             if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1321                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1322                 blockdict.append({
1323                     "blocked": block["blocked"],
1324                     "reason" : block["reason"],
1325                 })
1326
1327         if instances.has_pending(blocker):
1328             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1329             instances.update_data(blocker)
1330
1331         logger.debug("Invoking commit() ...")
1332         database.connection.commit()
1333
1334         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1335         if config.get("bot_enabled") and len(blockdict) > 0:
1336             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1337             network.send_bot_post(blocker, blockdict)
1338
1339     logger.debug("Success! - EXIT!")
1340     return 0
1341
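# The bot payload assembled by fetch_joinfediverse() above (and by other
# commands) is a list of small dicts; network.send_bot_post() receives e.g.:
#
#     blockdict = [
#         {"blocked": "example.com", "reason": "spam"},
#         {"blocked": "other.example", "reason": None},
#     ]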
1342 def recheck_obfuscation(args: argparse.Namespace) -> int:
1343     logger.debug("args[]='%s' - CALLED!", type(args))
1344
1345     logger.debug("Invoking locking.acquire() ...")
1346     locking.acquire()
1347
1348     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1349         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1350     elif isinstance(args.software, str) and args.software != "":
1351         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1352     else:
1353         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1354
1355     rows = database.cursor.fetchall()
1356     logger.info("Checking %d domains ...", len(rows))
1357     for row in rows:
1358         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1359         if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
1360             logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1361             continue
1362
1363         blocking = list()
1364         if row["software"] == "pleroma":
1365             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1366             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1367         elif row["software"] == "mastodon":
1368             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1369             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1370         elif row["software"] == "lemmy":
1371             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1372             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1373         elif row["software"] == "friendica":
1374             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1375             blocking = friendica.fetch_blocks(row["domain"])
1376         elif row["software"] == "misskey":
1377             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1378             blocking = misskey.fetch_blocks(row["domain"])
1379         else:
1380             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1381
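        # The chain above could also be written as a dispatch table, e.g.
        # (sketch only - fetch_blocks() signatures differ per network):
        #     fetchers = {"friendica": friendica.fetch_blocks,
        #                 "misskey"  : misskey.fetch_blocks}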
1382         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1383         instances.set_total_blocks(row["domain"], blocking)
1384
1385         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1386         obfuscated = 0
1387         blockdict = list()
1388         for block in blocking:
1389             logger.debug("block[blocked]='%s'", block["blocked"])
1390             blocked = None
1391
1392             if block["blocked"] == "":
1393                 logger.debug("block[blocked] is empty - SKIPPED!")
1394                 continue
1395             elif block["blocked"].endswith(".arpa"):
1396                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1397                 continue
1398             elif block["blocked"].endswith(".tld"):
1399                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1400                 continue
1401             elif block["blocked"].endswith(".onion"):
1402                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1403                 continue
1404             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1405                 logger.debug("blocked='%s' is obfuscated.", block["blocked"])
1406                 obfuscated = obfuscated + 1
1407                 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1408             elif not utils.is_domain_wanted(block["blocked"]):
1409                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1410                 continue
1411             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1412                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1413                 continue
1414
1415             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1416             if blocked is not None and blocked != block["blocked"]:
1417                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1418                 obfuscated = obfuscated - 1
1419                 if blocks.is_instance_blocked(row["domain"], blocked):
1420                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1421                     continue
1422
1423                 block["block_level"] = utils.alias_block_level(block["block_level"])
1424
1425                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1426                 if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1427                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1428                     blockdict.append({
1429                         "blocked": blocked,
1430                         "reason" : block["reason"],
1431                     })
1432
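        # Deobfuscation example: a blocker may publish "*.example.com" plus a
        # hash digest; utils.deobfuscate_domain() attempts to resolve that back
        # to a concrete, already-known domain (assumed behaviour - the actual
        # matching logic lives in fba.utils).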
1433         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1434         if obfuscated == 0 and len(blocking) > 0:
1435             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1436             instances.set_has_obfuscation(row["domain"], False)
1437
1438         if instances.has_pending(row["domain"]):
1439             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1440             instances.update_data(row["domain"])
1441
1442         logger.debug("Invoking commit() ...")
1443         database.connection.commit()
1444
1445         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1446         if config.get("bot_enabled") and len(blockdict) > 0:
1447             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1448             network.send_bot_post(row["domain"], blockdict)
1449
1450     logger.debug("Success! - EXIT!")
1451     return 0
1452
1453 def fetch_fedilist(args: argparse.Namespace) -> int:
1454     logger.debug("args[]='%s' - CALLED!", type(args))
1455
1456     logger.debug("Invoking locking.acquire() ...")
1457     locking.acquire()
1458
1459     api_domain = "demo.fedilist.com"
1460     if apis.is_recent(api_domain):
1461         logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
1462         return 0
1463     else:
1464         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
1465         apis.update(api_domain)
1466
1467     url = f"http://{api_domain}/instance/csv?onion=not"
1468     if args.software is not None and args.software != "":
1469         logger.debug("args.software='%s'", args.software)
1470         url = f"http://{api_domain}/instance/csv?software={args.software}&onion=not"
1471
1472     logger.info("Fetching url='%s' ...", url)
1473     response = reqto.get(
1474         url,
1475         headers=network.web_headers,
1476         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1477         allow_redirects=False
1478     )
1479
1480     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1481     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1482         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1483         return 1
1484
1485     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1486
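    # Each parsed CSV row is a dict; only the "hostname" column is used below,
    # e.g. (remaining columns depend on fedilist's export format):
    #     {"hostname": "example.social", "software": "mastodon", ...}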
1487     logger.debug("reader[]='%s'", type(reader))
1488     blockdict = list()
1489     for row in reader:
1490         logger.debug("row[]='%s'", type(row))
1491         domain = tidyup.domain(row["hostname"])
1492         logger.debug("domain='%s' - AFTER!", domain)
1493
1494         if domain == "":
1495             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1496             continue
1497         elif not utils.is_domain_wanted(domain):
1498             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1499             continue
1500         elif (args.all is None or not args.all) and instances.is_registered(domain):
1501             logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s' - SKIPPED!", domain, type(args.all))
1502             continue
1503         elif instances.is_recent(domain):
1504             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1505             continue
1506
1507         logger.info("Fetching instances from domain='%s' ...", domain)
1508         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1509
1510     logger.debug("Success! - EXIT!")
1511     return 0
1512
1513 def update_nodeinfo(args: argparse.Namespace) -> int:
1514     logger.debug("args[]='%s' - CALLED!", type(args))
1515
1516     logger.debug("Invoking locking.acquire() ...")
1517     locking.acquire()
1518
1519     if args.domain is not None and args.domain != "":
1520         logger.debug("Fetching args.domain='%s'", args.domain)
1521         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1522     elif args.software is not None and args.software != "":
1523         logger.info("Fetching domains for args.software='%s'", args.software)
1524         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1525     else:
1526         logger.info("Fetching domains for recently updated ...")
1527         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1528
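    # Example recheck window: with "recheck_nodeinfo" set to 604800 (7 days,
    # an assumed config value), only instances whose nodeinfo is older than a
    # week - or was never fetched at all - are selected here.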
1529     domains = database.cursor.fetchall()
1530
1531     logger.info("Checking %d domain(s) ...", len(domains))
1532     cnt = 0
1533     for row in domains:
1534         logger.debug("row[]='%s'", type(row))
1535         try:
1536             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1537             software = federation.determine_software(row["domain"])
1538
1539             logger.debug("Determined software='%s'", software)
1540             if software != row["software"] and software is not None:
1541                 logger.warning("Software type has changed from '%s' to '%s'!", row["software"], software)
1542                 instances.set_software(row["domain"], software)
1543
1544             instances.set_success(row["domain"])
1545         except network.exceptions as exception:
1546             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1547             instances.set_last_error(row["domain"], exception)
1548
1549         instances.set_last_nodeinfo(row["domain"])
1550         instances.update_data(row["domain"])
1551         cnt = cnt + 1
1552
1553     logger.debug("Success! - EXIT!")
1554     return 0
1555
1556 def fetch_instances_social(args: argparse.Namespace) -> int:
1557     logger.debug("args[]='%s' - CALLED!", type(args))
1558
1559     logger.debug("Invoking locking.acquire() ...")
1560     locking.acquire()
1561
1562     api_domain = "instances.social"
1563
1564     if config.get("instances_social_api_key") == "":
1565         logger.error("API key not set. Please set in your config.json file.")
1566         return 1
1567     elif apis.is_recent(api_domain):
1568         logger.info("API from api_domain='%s' has recently been accessed - EXIT!", api_domain)
1569         return 0
1570     else:
1571         logger.debug("api_domain='%s' has not been recently used, marking ...", api_domain)
1572         apis.update(api_domain)
1573
1574     headers = {
1575         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1576     }
1577
1578     fetched = network.get_json_api(
1579         api_domain,
1580         "/api/1.0/instances/list?count=0&sort_by=name",
1581         headers,
1582         (config.get("connection_timeout"), config.get("read_timeout"))
1583     )
1584     logger.debug("fetched[]='%s'", type(fetched))
1585
1586     if "error_message" in fetched:
1587         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1588         return 2
1589     elif "exception" in fetched:
1590         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1591         return 3
1592     elif "json" not in fetched:
1593         logger.warning("fetched has no element 'json' - EXIT!")
1594         return 4
1595     elif "instances" not in fetched["json"]:
1596         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1597         return 5
1598
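    # On success the JSON payload is expected to look roughly like
    #     {"instances": [{"name": "example.social", ...}, ...]}
    # wrapped by get_json_api() under fetched["json"]; only "name" is used.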
1599     domains = list()
1600     rows = fetched["json"]["instances"]
1601
1602     logger.info("Checking %d row(s) ...", len(rows))
1603     for row in rows:
1604         logger.debug("row[]='%s'", type(row))
1605         domain = tidyup.domain(row["name"])
1606
1607         logger.debug("domain='%s' - AFTER!", domain)
1608         if domain == "":
1609             logger.debug("domain is empty - SKIPPED!")
1610             continue
1611         elif not utils.is_domain_wanted(domain):
1612             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1613             continue
1614         elif domain in domains:
1615             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1616             continue
1617         elif instances.is_registered(domain):
1618             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1619             continue
1620         elif instances.is_recent(domain):
1621             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1622             continue
1623
1624         logger.info("Fetching instances from domain='%s'", domain)
1625         federation.fetch_instances(domain, api_domain, None, inspect.currentframe().f_code.co_name)
1626
1627     logger.debug("Success! - EXIT!")
1628     return 0