]> git.mxchange.org Git - fba.git/blob - fba/commands.py
e7f2e2a49427039c246e29621ebf7d8a20ff0fa3
[fba.git] / fba / commands.py
1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU Affero General Public License for more details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17 import csv
18 import inspect
19 import json
20 import logging
21 import time
22
23 from urllib.parse import urlparse
24
25 import argparse
26 import atoma
27 import bs4
28 import markdown
29 import reqto
30 import validators
31
32 from fba import csrf
33 from fba import database
34 from fba import utils
35
36 from fba.helpers import blacklist
37 from fba.helpers import config
38 from fba.helpers import cookies
39 from fba.helpers import locking
40 from fba.helpers import software as software_helper
41 from fba.helpers import tidyup
42
43 from fba.http import federation
44 from fba.http import network
45
46 from fba.models import blocks
47 from fba.models import instances
48 from fba.models import sources
49
50 from fba.networks import friendica
51 from fba.networks import lemmy
52 from fba.networks import mastodon
53 from fba.networks import misskey
54 from fba.networks import pleroma
55
56 logging.basicConfig(level=logging.INFO)
57 logger = logging.getLogger(__name__)
58 #logger.setLevel(logging.DEBUG)
59
60 def check_instance(args: argparse.Namespace) -> int:
61     logger.debug("args.domain='%s' - CALLED!", args.domain)
62     status = 0
63     if not validators.domain(args.domain):
64         logger.warning("args.domain='%s' is not valid", args.domain)
65         status = 100
66     elif blacklist.is_blacklisted(args.domain):
67         logger.warning("args.domain='%s' is blacklisted", args.domain)
68         status = 101
69     elif instances.is_registered(args.domain):
70         logger.warning("args.domain='%s' is already registered", args.domain)
71         status = 102
72     else:
73         logger.info("args.domain='%s' is not known", args.domain)
74
75     logger.debug("status=%d - EXIT!", status)
76     return status
77
78 def check_nodeinfo(args: argparse.Namespace) -> int:
79     logger.debug("args[]='%s' - CALLED!", type(args))
80
81     # Fetch rows
82     database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")
83
84     cnt = 0
85     for row in database.cursor.fetchall():
86         logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
87         punycode = row["domain"].encode("idna").decode("utf-8")
88
89         if row["nodeinfo_url"].startswith("/"):
90             logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
91             continue
92         elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
93             logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
94             cnt = cnt + 1
95
96     logger.info("Found %d row(s)", cnt)
97
98     logger.debug("EXIT!")
99     return 0
100
101 def fetch_pixelfed_api(args: argparse.Namespace) -> int:
102     logger.debug("args[]='%s' - CALLED!", type(args))
103
104     # No CSRF by default, you don't have to add network.source_headers by yourself here
105     headers = tuple()
106     source_domain = "pixelfed.org"
107
108     if sources.is_recent(source_domain):
109         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
110         return 0
111     else:
112         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
113         sources.update(source_domain)
114
115     try:
116         logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
117         headers = csrf.determine(source_domain, dict())
118     except network.exceptions as exception:
119         logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
120         return list()
121
122     try:
123         logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
124         fetched = network.get_json_api(
125             source_domain,
126             "/api/v1/servers/all.json?scope=All&country=all&language=all",
127             headers,
128             (config.get("connection_timeout"), config.get("read_timeout"))
129         )
130
131         logger.debug("JSON API returned %d elements", len(fetched))
132         if "error_message" in fetched:
133             logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
134             return 101
135         elif "data" not in fetched["json"]:
136             logger.warning("API did not return JSON with 'data' element - EXIT!")
137             return 102
138
139         rows = fetched["json"]["data"]
140         logger.info("Checking %d fetched rows ...", len(rows))
141         for row in rows:
142             logger.debug("row[]='%s'", type(row))
143             if "domain" not in row:
144                 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
145                 continue
146             elif row["domain"] == "":
147                 logger.debug("row[domain] is empty - SKIPPED!")
148                 continue
149             elif not utils.is_domain_wanted(row["domain"]):
150                 logger.warning("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
151                 continue
152             elif instances.is_registered(row["domain"]):
153                 logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
154                 continue
155             elif instances.is_recent(row["domain"]):
156                 logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
157                 continue
158
159             logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
160             federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)
161
162     except network.exceptions as exception:
163         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
164         return 103
165
166     logger.debug("Success! - EXIT!")
167     return 0
168
169 def fetch_bkali(args: argparse.Namespace) -> int:
170     logger.debug("args[]='%s' - CALLED!", type(args))
171
172     logger.debug("Invoking locking.acquire() ...")
173     locking.acquire()
174
175     source_domain = "gql.sources.bka.li"
176     if sources.is_recent(source_domain):
177         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
178         return 0
179     else:
180         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
181         sources.update(source_domain)
182
183     domains = list()
184     try:
185         logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
186         fetched = network.post_json_api(
187             source_domain,
188             "/v1/graphql",
189             json.dumps({
190                 "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
191             })
192         )
193
194         logger.debug("fetched[]='%s'", type(fetched))
195         if "error_message" in fetched:
196             logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
197             return 100
198         elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
199             logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])
200             return 101
201
202         rows = fetched["json"]
203
204         logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
205         if len(rows) == 0:
206             raise Exception("WARNING: Returned no records")
207         elif "data" not in rows:
208             raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
209         elif "nodeinfo" not in rows["data"]:
210             raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")
211
212         for entry in rows["data"]["nodeinfo"]:
213             logger.debug("entry[%s]='%s'", type(entry), entry)
214             if "domain" not in entry:
215                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
216                 continue
217             elif entry["domain"] == "":
218                 logger.debug("entry[domain] is empty - SKIPPED!")
219                 continue
220             elif not utils.is_domain_wanted(entry["domain"]):
221                 logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
222                 continue
223             elif instances.is_registered(entry["domain"]):
224                 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
225                 continue
226             elif instances.is_recent(entry["domain"]):
227                 logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
228                 continue
229
230             logger.debug("Adding domain='%s' ...", entry["domain"])
231             domains.append(entry["domain"])
232
233     except network.exceptions as exception:
234         logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
235         return 102
236
237     logger.debug("domains()=%d", len(domains))
238     if len(domains) > 0:
239         logger.info("Adding %d new instances ...", len(domains))
240         for domain in domains:
241             try:
242                 logger.info("Fetching instances from domain='%s' ...", domain)
243                 federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
244             except network.exceptions as exception:
245                 logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
246                 instances.set_last_error(domain, exception)
247                 return 100
248
249     logger.debug("Success - EXIT!")
250     return 0
251
252 def fetch_blocks(args: argparse.Namespace) -> int:
253     logger.debug("args[]='%s' - CALLED!", type(args))
254     if args.domain is not None and args.domain != "":
255         logger.debug("args.domain='%s' - checking ...", args.domain)
256         if not validators.domain(args.domain):
257             logger.warning("args.domain='%s' is not valid.", args.domain)
258             return 100
259         elif blacklist.is_blacklisted(args.domain):
260             logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
261             return 101
262         elif not instances.is_registered(args.domain):
263             logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
264             return 102
265
266     logger.debug("Invoking locking.acquire() ...")
267     locking.acquire()
268
269     if args.domain is not None and args.domain != "":
270         # Re-check single domain
271         logger.debug("Querying database for single args.domain='%s' ...", args.domain)
272         database.cursor.execute(
273             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
274         )
275     elif args.software is not None and args.software != "":
276         # Re-check single software
277         logger.debug("Querying database for args.software='%s' ...", args.software)
278         database.cursor.execute(
279             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
280         )
281     else:
282         # Re-check after "timeout" (aka. minimum interval)
283         database.cursor.execute(
284             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
285         )
286
287     rows = database.cursor.fetchall()
288     logger.info("Checking %d entries ...", len(rows))
289     for blocker, software, origin, nodeinfo_url in rows:
290         logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
291         blocker = tidyup.domain(blocker)
292         logger.debug("blocker='%s' - AFTER!", blocker)
293
294         if blocker == "":
295             logger.warning("blocker is now empty!")
296             continue
297         elif nodeinfo_url is None or nodeinfo_url == "":
298             logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
299             continue
300         elif not utils.is_domain_wanted(blocker):
301             logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
302             continue
303
304         logger.debug("blocker='%s'", blocker)
305         instances.set_last_blocked(blocker)
306         instances.set_has_obfuscation(blocker, False)
307
308         blocking = list()
309         blockdict = list()
310         if software == "pleroma":
311             logger.info("blocker='%s',software='%s'", blocker, software)
312             blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
313         elif software == "mastodon":
314             logger.info("blocker='%s',software='%s'", blocker, software)
315             blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
316         elif software == "lemmy":
317             logger.info("blocker='%s',software='%s'", blocker, software)
318             blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
319         elif software == "friendica":
320             logger.info("blocker='%s',software='%s'", blocker, software)
321             blocking = friendica.fetch_blocks(blocker)
322         elif software == "misskey":
323             logger.info("blocker='%s',software='%s'", blocker, software)
324             blocking = misskey.fetch_blocks(blocker)
325         else:
326             logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
327
328         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
329         instances.set_total_blocks(blocker, blocking)
330
331         logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
332         blockdict = list()
333         for block in blocking:
334             logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])
335
336             if block["block_level"] == "":
337                 logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
338                 continue
339
340             logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
341             block["blocked"] = tidyup.domain(block["blocked"])
342             block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
343             logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])
344
345             if block["blocked"] == "":
346                 logger.warning("blocked is empty, blocker='%s'", blocker)
347                 continue
348             elif block["blocked"].endswith(".onion"):
349                 logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
350                 continue
351             elif block["blocked"].endswith(".arpa"):
352                 logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
353                 continue
354             elif block["blocked"].endswith(".tld"):
355                 logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
356                 continue
357             elif block["blocked"].find("*") >= 0:
358                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
359
360                 # Some friendica servers also obscure domains without hash
361                 row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)
362
363                 logger.debug("row[]='%s'", type(row))
364                 if row is None:
365                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
366                     instances.set_has_obfuscation(blocker, True)
367                     continue
368
369                 block["blocked"] = row["domain"]
370                 origin           = row["origin"]
371                 nodeinfo_url     = row["nodeinfo_url"]
372             elif block["blocked"].find("?") >= 0:
373                 logger.debug("blocker='%s' uses obfuscated domains", blocker)
374
375                 # Some obscure them with question marks, not sure if that's dependent on version or not
376                 row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)
377
378                 logger.debug("row[]='%s'", type(row))
379                 if row is None:
380                     logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
381                     instances.set_has_obfuscation(blocker, True)
382                     continue
383
384                 block["blocked"] = row["domain"]
385                 origin           = row["origin"]
386                 nodeinfo_url     = row["nodeinfo_url"]
387
388             logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
389             if block["blocked"] == "":
390                 logger.debug("block[blocked] is empty - SKIPPED!")
391                 continue
392             elif not utils.is_domain_wanted(block["blocked"]):
393                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
394                 continue
395             elif block["block_level"] in ["accept", "accepted"]:
396                 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
397                 continue
398             elif not instances.is_registered(block["blocked"]):
399                 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
400                 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
401
402             block["block_level"] = utils.alias_block_level(block["block_level"])
403
404             if utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
405                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
406                 blockdict.append({
407                     "blocked": block["blocked"],
408                     "reason" : block["reason"],
409                 })
410
411             logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
412             cookies.clear(block["blocked"])
413
414         logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
415         if instances.has_pending(blocker):
416             logger.debug("Flushing updates for blocker='%s' ...", blocker)
417             instances.update_data(blocker)
418
419         logger.debug("Invoking commit() ...")
420         database.connection.commit()
421
422         logger.debug("Invoking cookies.clear(%s) ...", blocker)
423         cookies.clear(blocker)
424
425         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
426         if config.get("bot_enabled") and len(blockdict) > 0:
427             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
428             network.send_bot_post(blocker, blockdict)
429
430     logger.debug("Success! - EXIT!")
431     return 0
432
433 def fetch_observer(args: argparse.Namespace) -> int:
434     logger.debug("args[]='%s' - CALLED!", type(args))
435
436     logger.debug("Invoking locking.acquire() ...")
437     locking.acquire()
438
439     source_domain = "fediverse.observer"
440     if sources.is_recent(source_domain):
441         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
442         return 0
443     else:
444         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
445         sources.update(source_domain)
446
447     types = list()
448     if args.software is None:
449         logger.info("Fetching software list ...")
450         raw = utils.fetch_url(
451             f"https://{source_domain}",
452             network.web_headers,
453             (config.get("connection_timeout"), config.get("read_timeout"))
454         ).text
455         logger.debug("raw[%s]()=%d", type(raw), len(raw))
456
457         doc = bs4.BeautifulSoup(raw, features="html.parser")
458         logger.debug("doc[]='%s'", type(doc))
459
460         items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
461         logger.debug("items[]='%s'", type(items))
462
463         logger.info("Checking %d menu items ...", len(items))
464         for item in items:
465             logger.debug("item[%s]='%s'", type(item), item)
466             if item.text.lower() == "all":
467                 logger.debug("Skipping 'All' menu entry ...")
468                 continue
469
470             logger.debug("Appending item.text='%s' ...", item.text)
471             types.append(tidyup.domain(item.text))
472     else:
473         logger.info("Adding args.software='%s' as type ...", args.software)
474         types.append(args.software)
475
476     logger.info("Fetching %d different table data ...", len(types))
477     for software in types:
478         logger.debug("software='%s' - BEFORE!", software)
479         if args.software is not None and args.software != software:
480             logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
481             continue
482
483         doc = None
484         try:
485             logger.debug("Fetching table data for software='%s' ...", software)
486             raw = utils.fetch_url(
487                 f"https://{source_domain}/app/views/tabledata.php?software={software}",
488                 network.web_headers,
489                 (config.get("connection_timeout"), config.get("read_timeout"))
490             ).text
491             logger.debug("raw[%s]()=%d", type(raw), len(raw))
492
493             doc = bs4.BeautifulSoup(raw, features="html.parser")
494             logger.debug("doc[]='%s'", type(doc))
495         except network.exceptions as exception:
496             logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
497             continue
498
499         items = doc.findAll("a", {"class": "url"})
500         logger.info("Checking %d items,software='%s' ...", len(items), software)
501         for item in items:
502             logger.debug("item[]='%s'", type(item))
503             domain = item.decode_contents()
504
505             logger.debug("domain='%s' - AFTER!", domain)
506             if domain == "":
507                 logger.debug("domain is empty - SKIPPED!")
508                 continue
509             elif not utils.is_domain_wanted(domain):
510                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
511                 continue
512             elif instances.is_registered(domain):
513                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
514                 continue
515             elif instances.is_recent(domain):
516                 logger.debug("domain='%s' is recently being handled - SKIPPED!", domain)
517                 continue
518
519             software = software_helper.alias(software)
520             logger.info("Fetching instances for domain='%s'", domain)
521             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
522
523     logger.debug("Success! - EXIT!")
524     return 0
525
526 def fetch_todon_wiki(args: argparse.Namespace) -> int:
527     logger.debug("args[]='%s' - CALLED!", type(args))
528
529     logger.debug("Invoking locking.acquire() ...")
530     locking.acquire()
531
532     source_domain = "wiki.todon.eu"
533     if sources.is_recent(source_domain):
534         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
535         return 0
536     else:
537         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
538         sources.update(source_domain)
539
540     blocklist = {
541         "silenced": list(),
542         "reject": list(),
543     }
544
545     raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
546     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
547
548     doc = bs4.BeautifulSoup(raw, "html.parser")
549     logger.debug("doc[]='%s'", type(doc))
550
551     silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
552     logger.info("Checking %d silenced/limited entries ...", len(silenced))
553     blocklist["silenced"] = utils.find_domains(silenced, "div")
554
555     suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
556     logger.info("Checking %d suspended entries ...", len(suspended))
557     blocklist["reject"] = utils.find_domains(suspended, "div")
558
559     blocking = blocklist["silenced"] + blocklist["reject"]
560     blocker = "todon.eu"
561
562     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
563     instances.set_total_blocks(blocker, blocking)
564
565     blockdict = list()
566     for block_level in blocklist:
567         blockers = blocklist[block_level]
568
569         logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
570         for blocked in blockers:
571             logger.debug("blocked='%s'", blocked)
572
573             if not instances.is_registered(blocked):
574                 try:
575                     logger.info("Fetching instances from domain='%s' ...", blocked)
576                     federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
577                 except network.exceptions as exception:
578                     logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
579                     instances.set_last_error(blocked, exception)
580
581             if blocks.is_instance_blocked(blocker, blocked, block_level):
582                 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
583                 continue
584
585             logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
586             if utils.process_block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
587                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
588                 blockdict.append({
589                     "blocked": blocked,
590                     "reason" : None,
591                 })
592
593         logger.debug("Invoking commit() ...")
594         database.connection.commit()
595
596         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
597         if config.get("bot_enabled") and len(blockdict) > 0:
598             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
599             network.send_bot_post(blocker, blockdict)
600
601     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
602     if instances.has_pending(blocker):
603         logger.debug("Flushing updates for blocker='%s' ...", blocker)
604         instances.update_data(blocker)
605
606     logger.debug("Success! - EXIT!")
607     return 0
608
609 def fetch_cs(args: argparse.Namespace):
610     logger.debug("args[]='%s' - CALLED!", type(args))
611
612     logger.debug("Invoking locking.acquire() ...")
613     locking.acquire()
614
615     extensions = [
616         "extra",
617         "abbr",
618         "attr_list",
619         "def_list",
620         "fenced_code",
621         "footnotes",
622         "md_in_html",
623         "admonition",
624         "codehilite",
625         "legacy_attrs",
626         "legacy_em",
627         "meta",
628         "nl2br",
629         "sane_lists",
630         "smarty",
631         "toc",
632         "wikilinks"
633     ]
634
635     blocklist = {
636         "silenced": list(),
637         "reject"  : list(),
638     }
639
640     source_domain = "raw.githubusercontent.com"
641     if sources.is_recent(source_domain):
642         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
643         return 0
644     else:
645         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
646         sources.update(source_domain)
647
648     raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
649     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
650
651     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
652     logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
653
654     silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
655     logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
656     blocklist["silenced"] = federation.find_domains(silenced)
657
658     blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
659     logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
660     blocklist["reject"] = federation.find_domains(blocked)
661
662     blocking = blocklist["silenced"] + blocklist["reject"]
663     blocker = "chaos.social"
664
665     logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
666     instances.set_total_blocks(blocker, blocking)
667
668     logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
669     blockdict = list()
670     if len(blocking) > 0:
671         for block_level in blocklist:
672             logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))
673
674             for row in blocklist[block_level]:
675                 logger.debug("row[%s]='%s'", type(row), row)
676                 if instances.is_recent(row["domain"], "last_blocked"):
677                     logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
678                     continue
679                 elif not instances.is_registered(row["domain"]):
680                     try:
681                         logger.info("Fetching instances from domain='%s' ...", row["domain"])
682                         federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
683                     except network.exceptions as exception:
684                         logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
685                         instances.set_last_error(row["domain"], exception)
686
687                 if utils.process_block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
688                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
689                     blockdict.append({
690                         "blocked": row["domain"],
691                         "reason" : row["reason"],
692                     })
693
694         logger.debug("Invoking commit() ...")
695         database.connection.commit()
696
697         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
698         if config.get("bot_enabled") and len(blockdict) > 0:
699             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
700             network.send_bot_post(blocker, blockdict)
701
702     logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
703     if instances.has_pending(blocker):
704         logger.debug("Flushing updates for blocker='%s' ...", blocker)
705         instances.update_data(blocker)
706
707     logger.debug("Success! - EXIT!")
708     return 0
709
710 def fetch_fba_rss(args: argparse.Namespace) -> int:
711     logger.debug("args[]='%s' - CALLED!", type(args))
712
713     domains = list()
714
715     logger.debug("Invoking locking.acquire() ...")
716     locking.acquire()
717
718     components = urlparse(args.feed)
719
720     if sources.is_recent(components.netloc):
721         logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc)
722         return 0
723     else:
724         logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
725         sources.update(components.netloc)
726
727     logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
728     response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
729
730     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
731     if response.ok and response.status_code < 300 and len(response.text) > 0:
732         logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
733         rss = atoma.parse_rss_bytes(response.content)
734
735         logger.debug("rss[]='%s'", type(rss))
736         for item in rss.items:
737             logger.debug("item='%s'", item)
738             domain = tidyup.domain(item.link.split("=")[1])
739
740             logger.debug("domain='%s' - AFTER!", domain)
741             if domain == "":
742                 logger.debug("domain is empty - SKIPPED!")
743                 continue
744             elif not utils.is_domain_wanted(domain):
745                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
746                 continue
747             elif domain in domains:
748                 logger.debug("domain='%s' is already added - SKIPPED!", domain)
749                 continue
750             elif instances.is_registered(domain):
751                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
752                 continue
753             elif instances.is_recent(domain):
754                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
755                 continue
756
757             logger.debug("Adding domain='%s'", domain)
758             domains.append(domain)
759
760     logger.debug("domains()=%d", len(domains))
761     if len(domains) > 0:
762         logger.info("Adding %d new instances ...", len(domains))
763         for domain in domains:
764             try:
765                 logger.info("Fetching instances from domain='%s' ...", domain)
766                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
767             except network.exceptions as exception:
768                 logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
769                 instances.set_last_error(domain, exception)
770                 return 100
771
772     logger.debug("Success! - EXIT!")
773     return 0
774
775 def fetch_fbabot_atom(args: argparse.Namespace) -> int:
776     logger.debug("args[]='%s' - CALLED!", type(args))
777
778     logger.debug("Invoking locking.acquire() ...")
779     locking.acquire()
780
781     source_domain = "ryona.agency"
782     if sources.is_recent(source_domain):
783         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
784         return 0
785     else:
786         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
787         sources.update(source_domain)
788
789     feed = f"https://{source_domain}/users/fba/feed.atom"
790
791     domains = list()
792
793     logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
794     response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
795
796     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
797     if response.ok and response.status_code < 300 and len(response.text) > 0:
798         logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
799         atom = atoma.parse_atom_bytes(response.content)
800
801         logger.debug("atom[]='%s'", type(atom))
802         for entry in atom.entries:
803             logger.debug("entry[]='%s'", type(entry))
804             doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
805             logger.debug("doc[]='%s'", type(doc))
806             for element in doc.findAll("a"):
807                 logger.debug("element[]='%s'", type(element))
808                 for href in element["href"].split(","):
809                     logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
810                     domain = tidyup.domain(href)
811
812                     logger.debug("domain='%s' - AFTER!", domain)
813                     if domain == "":
814                         logger.debug("domain is empty - SKIPPED!")
815                         continue
816                     elif not utils.is_domain_wanted(domain):
817                         logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
818                         continue
819                     elif domain in domains:
820                         logger.debug("domain='%s' is already added - SKIPPED!", domain)
821                         continue
822                     elif instances.is_registered(domain):
823                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
824                         continue
825                     elif instances.is_recent(domain):
826                         logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
827                         continue
828
829                     logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
830                     domains.append(domain)
831
832     logger.debug("domains()=%d", len(domains))
833     if len(domains) > 0:
834         logger.info("Adding %d new instances ...", len(domains))
835         for domain in domains:
836             logger.debug("domain='%s'", domain)
837             try:
838                 logger.info("Fetching instances from domain='%s' ...", domain)
839                 federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
840             except network.exceptions as exception:
841                 logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
842                 instances.set_last_error(domain, exception)
843                 return 100
844
845     logger.debug("Success! - EXIT!")
846     return 0
847
848 def fetch_instances(args: argparse.Namespace) -> int:
849     logger.debug("args[]='%s' - CALLED!", type(args))
850
851     logger.debug("args.domain='%s' - checking ...", args.domain)
852     if not validators.domain(args.domain):
853         logger.warning("args.domain='%s' is not valid.", args.domain)
854         return 100
855     elif blacklist.is_blacklisted(args.domain):
856         logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
857         return 101
858
859     logger.debug("Invoking locking.acquire() ...")
860     locking.acquire()
861
862     # Initial fetch
863     try:
864         logger.info("Fetching instances from args.domain='%s' ...", args.domain)
865         federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
866     except network.exceptions as exception:
867         logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
868         instances.set_last_error(args.domain, exception)
869         instances.update_data(args.domain)
870         return 100
871
872     if args.single:
873         logger.debug("Not fetching more instances - EXIT!")
874         return 0
875
876     # Loop through some instances
877     database.cursor.execute(
878         "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
879     )
880
881     rows = database.cursor.fetchall()
882     logger.info("Checking %d entries ...", len(rows))
883     for row in rows:
884         logger.debug("row[domain]='%s'", row["domain"])
885         if row["domain"] == "":
886             logger.debug("row[domain] is empty - SKIPPED!")
887             continue
888         elif not utils.is_domain_wanted(row["domain"]):
889             logger.warning("Domain row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
890             continue
891
892         try:
893             logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row["domain"], row["origin"], row["software"], row["nodeinfo_url"])
894             federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
895         except network.exceptions as exception:
896             logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
897             instances.set_last_error(row["domain"], exception)
898
899     logger.debug("Success - EXIT!")
900     return 0
901
902 def fetch_oliphant(args: argparse.Namespace) -> int:
903     logger.debug("args[]='%s' - CALLED!", type(args))
904
905     logger.debug("Invoking locking.acquire() ...")
906     locking.acquire()
907
908     source_domain = "codeberg.org"
909     if sources.is_recent(source_domain):
910         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
911         return 0
912     else:
913         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
914         sources.update(source_domain)
915
916     # Base URL
917     base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
918
919     # URLs to fetch
920     blocklists = (
921         {
922             "blocker": "artisan.chat",
923             "csv_url": "mastodon/artisan.chat.csv",
924         },{
925             "blocker": "mastodon.art",
926             "csv_url": "mastodon/mastodon.art.csv",
927         },{
928             "blocker": "pleroma.envs.net",
929             "csv_url": "mastodon/pleroma.envs.net.csv",
930         },{
931             "blocker": "oliphant.social",
932             "csv_url": "mastodon/_unified_tier3_blocklist.csv",
933         },{
934             "blocker": "mastodon.online",
935             "csv_url": "mastodon/mastodon.online.csv",
936         },{
937             "blocker": "mastodon.social",
938             "csv_url": "mastodon/mastodon.social.csv",
939         },{
940             "blocker": "mastodon.social",
941             "csv_url": "other/missing-tier0-mastodon.social.csv",
942         },{
943             "blocker": "rage.love",
944             "csv_url": "mastodon/rage.love.csv",
945         },{
946             "blocker": "sunny.garden",
947             "csv_url": "mastodon/sunny.garden.csv",
948         },{
949             "blocker": "solarpunk.moe",
950             "csv_url": "mastodon/solarpunk.moe.csv",
951         },{
952             "blocker": "toot.wales",
953             "csv_url": "mastodon/toot.wales.csv",
954         },{
955             "blocker": "union.place",
956             "csv_url": "mastodon/union.place.csv",
957         }
958     )
959
960     domains = list()
961
962     logger.debug("Downloading %d files ...", len(blocklists))
963     for block in blocklists:
964         # Is domain given and not equal blocker?
965         if isinstance(args.domain, str) and args.domain != block["blocker"]:
966             logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
967             continue
968         elif args.domain in domains:
969             logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
970             continue
971         elif instances.is_recent(block["blocker"]):
972             logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
973             continue
974
975         # Fetch this URL
976         logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
977         response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
978
979         logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
980         if not response.ok or response.status_code >= 300 or response.content == "":
981             logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
982             continue
983
984         logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
985         reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
986
987         blockdict = list()
988
989         logger.info("Processing %d rows ...", len(reader))
990         cnt = 0
991         for row in reader:
992             logger.debug("row[%s]='%s'", type(row), row)
993             domain = severity = None
994             reject_media = reject_reports = False
995
996             if "#domain" in row:
997                 domain = row["#domain"]
998             elif "domain" in row:
999                 domain = row["domain"]
1000             else:
1001                 logger.debug("row='%s' does not contain domain column", row)
1002                 continue
1003
1004             if "#severity" in row:
1005                 severity = row["#severity"]
1006             elif "severity" in row:
1007                 severity = row["severity"]
1008             else:
1009                 logger.debug("row='%s' does not contain severity column", row)
1010                 continue
1011
1012             if "#reject_media" in row and row["#reject_media"].lower() == "true":
1013                 reject_media = True
1014             elif "reject_media" in row and row["reject_media"].lower() == "true":
1015                 reject_media = True
1016
1017             if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1018                 reject_reports = True
1019             elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1020                 reject_reports = True
1021
1022             cnt = cnt + 1
1023             logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
1024             if domain == "":
1025                 logger.debug("domain is empty - SKIPPED!")
1026                 continue
1027             elif not utils.is_domain_wanted(domain):
1028                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1029                 continue
1030
1031             logger.debug("Marking domain='%s' as handled", domain)
1032             domains.append(domain)
1033
1034             logger.debug("Processing domain='%s' ...", domain)
1035             processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1036             logger.debug("processed='%s'", processed)
1037
1038             if utils.process_block(block["blocker"], domain, None, "reject") and config.get("bot_enabled"):
1039                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1040                 blockdict.append({
1041                     "blocked": domain,
1042                     "reason" : block["reason"],
1043                 })
1044
1045             if reject_media:
1046                 utils.process_block(block["blocker"], domain, None, "reject_media")
1047             if reject_reports:
1048                 utils.process_block(block["blocker"], domain, None, "reject_reports")
1049
1050         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", block["blocker"], cnt)
1051         instances.set_total_blocks(block["blocker"], cnt)
1052
1053         logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1054         if instances.has_pending(block["blocker"]):
1055             logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1056             instances.update_data(block["blocker"])
1057
1058         logger.debug("Invoking commit() ...")
1059         database.connection.commit()
1060
1061         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1062         if config.get("bot_enabled") and len(blockdict) > 0:
1063             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1064             network.send_bot_post(block["blocker"], blockdict)
1065
1066     logger.debug("Success! - EXIT!")
1067     return 0
1068
1069 def fetch_txt(args: argparse.Namespace) -> int:
1070     logger.debug("args[]='%s' - CALLED!", type(args))
1071
1072     logger.debug("Invoking locking.acquire() ...")
1073     locking.acquire()
1074
1075     # Static URLs
1076     urls = ({
1077         "blocker": "seirdy.one",
1078         "url"    : "https://seirdy.one/pb/bsl.txt",
1079     },)
1080
1081     logger.info("Checking %d text file(s) ...", len(urls))
1082     for row in urls:
1083         logger.debug("Fetching row[url]='%s' ...", row["url"])
1084         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1085
1086         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1087         if response.ok and response.status_code < 300 and response.text != "":
1088             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1089             domains = response.text.split("\n")
1090
1091             logger.info("Processing %d domains ...", len(domains))
1092             for domain in domains:
1093                 logger.debug("domain='%s' - BEFORE!", domain)
1094                 domain = tidyup.domain(domain)
1095
1096                 logger.debug("domain='%s' - AFTER!", domain)
1097                 if domain == "":
1098                     logger.debug("domain is empty - SKIPPED!")
1099                     continue
1100                 elif not utils.is_domain_wanted(domain):
1101                     logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1102                     continue
1103                 elif instances.is_recent(domain):
1104                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1105                     continue
1106
1107                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1108                 processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1109
1110                 logger.debug("processed='%s'", processed)
1111                 if not processed:
1112                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1113                     continue
1114
1115     logger.debug("Success! - EXIT!")
1116     return 0
1117
1118 def fetch_fedipact(args: argparse.Namespace) -> int:
1119     logger.debug("args[]='%s' - CALLED!", type(args))
1120
1121     logger.debug("Invoking locking.acquire() ...")
1122     locking.acquire()
1123
1124     source_domain = "fedipact.online"
1125     if sources.is_recent(source_domain):
1126         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1127         return 0
1128     else:
1129         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1130         sources.update(source_domain)
1131
1132     response = utils.fetch_url(
1133         f"https://{source_domain}",
1134         network.web_headers,
1135         (config.get("connection_timeout"), config.get("read_timeout"))
1136     )
1137
1138     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1139     if response.ok and response.status_code < 300 and response.text != "":
1140         logger.debug("Parsing %d Bytes ...", len(response.text))
1141
1142         doc = bs4.BeautifulSoup(response.text, "html.parser")
1143         logger.debug("doc[]='%s'", type(doc))
1144
1145         rows = doc.findAll("li")
1146         logger.info("Checking %d row(s) ...", len(rows))
1147         for row in rows:
1148             logger.debug("row[]='%s'", type(row))
1149             domain = tidyup.domain(row.contents[0])
1150
1151             logger.debug("domain='%s' - AFTER!", domain)
1152             if domain == "":
1153                 logger.debug("domain is empty - SKIPPED!")
1154                 continue
1155             elif not utils.is_domain_wanted(domain):
1156                 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1157                 continue
1158             elif instances.is_registered(domain):
1159                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1160                 continue
1161             elif instances.is_recent(domain):
1162                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1163                 continue
1164
1165             logger.info("Fetching domain='%s' ...", domain)
1166             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1167
1168     logger.debug("Success! - EXIT!")
1169     return 0
1170
1171 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1172     logger.debug("args[]='%s' - CALLED!", type(args))
1173
1174     logger.debug("Invoking locking.acquire() ...")
1175     locking.acquire()
1176
1177     source_domain = "joinfediverse.wiki"
1178     if sources.is_recent(source_domain):
1179         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1180         return 0
1181     else:
1182         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1183         sources.update(source_domain)
1184
1185     raw = utils.fetch_url(
1186         f"https://{source_domain}/FediBlock",
1187         network.web_headers,
1188         (config.get("connection_timeout"), config.get("read_timeout"))
1189     ).text
1190     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1191
1192     doc = bs4.BeautifulSoup(raw, "html.parser")
1193     logger.debug("doc[]='%s'", type(doc))
1194
1195     tables = doc.findAll("table", {"class": "wikitable"})
1196
1197     logger.info("Analyzing %d table(s) ...", len(tables))
1198     blocklist = list()
1199     for table in tables:
1200         logger.debug("table[]='%s'", type(table))
1201
1202         rows = table.findAll("tr")
1203         logger.info("Checking %d row(s) ...", len(rows))
1204         block_headers = dict()
1205         for row in rows:
1206             logger.debug("row[%s]='%s'", type(row), row)
1207
1208             headers = row.findAll("th")
1209             logger.debug("Found headers()=%d header(s)", len(headers))
1210             if len(headers) > 1:
1211                 block_headers = dict()
1212                 cnt = 0
1213                 for header in headers:
1214                     cnt = cnt + 1
1215                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1216                     text = header.contents[0]
1217
1218                     logger.debug("text[]='%s'", type(text))
1219                     if not isinstance(text, str):
1220                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1221                         continue
1222                     elif validators.domain(text.strip()):
1223                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1224                         continue
1225
1226                     text = tidyup.domain(text.strip())
1227                     logger.debug("text='%s'", text)
1228                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1229                         logger.debug("Found header: '%s'=%d", text, cnt)
1230                         block_headers[cnt] = text
1231
1232             elif len(block_headers) == 0:
1233                 logger.debug("row is not scrapable - SKIPPED!")
1234                 continue
1235             elif len(block_headers) > 0:
1236                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1237                 cnt = 0
1238                 block = dict()
1239
1240                 for element in row.find_all(["th", "td"]):
1241                     cnt = cnt + 1
1242                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1243                     if cnt in block_headers:
1244                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1245
1246                         text = element.text.strip()
1247                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1248
1249                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1250                         if key in ["domain", "instance"]:
1251                             block[key] = text
1252                         elif key == "reason":
1253                             block[key] = tidyup.reason(text)
1254                         elif key == "subdomain(s)":
1255                             block[key] = list()
1256                             if text != "":
1257                                 block[key] = text.split("/")
1258                         else:
1259                             logger.debug("key='%s'", key)
1260                             block[key] = text
1261
1262                 logger.debug("block()=%d ...", len(block))
1263                 if len(block) > 0:
1264                     logger.debug("Appending block()=%d ...", len(block))
1265                     blocklist.append(block)
1266
1267     logger.debug("blocklist()=%d", len(blocklist))
1268
1269     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1270     domains = database.cursor.fetchall()
1271
1272     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1273     blocking = list()
1274     for block in blocklist:
1275         logger.debug("block='%s'", block)
1276         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1277             origin = block["blocked"]
1278             for subdomain in block["subdomain(s)"]:
1279                 block["blocked"] = subdomain + "." + origin
1280                 blocking.append(block)
1281         else:
1282             blocking.append(block)
1283
1284     logger.debug("blocking()=%d", blocking)
1285     for block in blocking:
1286         logger.debug("block[]='%s'", type(block))
1287         block["blocked"] = tidyup.domain(block["blocked"])
1288
1289         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1290         if block["blocked"] == "":
1291             logger.debug("block[blocked] is empty - SKIPPED!")
1292             continue
1293         elif not utils.is_domain_wanted(block["blocked"]):
1294             logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1295             continue
1296         elif instances.is_recent(block["blocked"]):
1297             logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
1298             continue
1299
1300         logger.info("Proccessing blocked='%s' ...", block["blocked"])
1301         utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1302
1303     blockdict = list()
1304     for blocker in domains:
1305         blocker = blocker[0]
1306         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1307
1308         for block in blocking:
1309             logger.debug("block[blocked]='%s',block[reason]='%s' - BEFORE!", block["blocked"], block["reason"])
1310             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1311
1312             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1313             if block["blocked"] == "":
1314                 logger.debug("block[blocked] is empty - SKIPPED!")
1315                 continue
1316             elif not utils.is_domain_wanted(block["blocked"]):
1317                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1318                 continue
1319
1320             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1321             if utils.process_block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1322                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
1323                 blockdict.append({
1324                     "blocked": block["blocked"],
1325                     "reason" : block["reason"],
1326                 })
1327
1328         if instances.has_pending(blocker):
1329             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1330             instances.update_data(blocker)
1331
1332         logger.debug("Invoking commit() ...")
1333         database.connection.commit()
1334
1335         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1336         if config.get("bot_enabled") and len(blockdict) > 0:
1337             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", blocker, len(blockdict))
1338             network.send_bot_post(blocker, blockdict)
1339
1340     logger.debug("Success! - EXIT!")
1341     return 0
1342
1343 def recheck_obfuscation(args: argparse.Namespace) -> int:
1344     logger.debug("args[]='%s' - CALLED!", type(args))
1345
1346     logger.debug("Invoking locking.acquire() ...")
1347     locking.acquire()
1348
1349     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1350         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1351     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1352         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1353     else:
1354         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1355
1356     rows = database.cursor.fetchall()
1357     logger.info("Checking %d domains ...", len(rows))
1358     for row in rows:
1359         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1360         if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
1361             logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1362             continue
1363
1364         blocking = list()
1365         if row["software"] == "pleroma":
1366             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1367             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1368         elif row["software"] == "mastodon":
1369             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1370             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1371         elif row["software"] == "lemmy":
1372             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1373             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1374         elif row["software"] == "friendica":
1375             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1376             blocking = friendica.fetch_blocks(row["domain"])
1377         elif row["software"] == "misskey":
1378             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1379             blocking = misskey.fetch_blocks(row["domain"])
1380         else:
1381             logger.warning("Unknown sofware: domain='%s',software='%s'", row["domain"], row["software"])
1382
1383         logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1384         instances.set_total_blocks(row["domain"], blocking)
1385
1386         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1387         obfuscated = 0
1388         blockdict = list()
1389         for block in blocking:
1390             logger.debug("block[blocked]='%s'", block["blocked"])
1391             blocked = None
1392
1393             if block["blocked"] == "":
1394                 logger.debug("block[blocked] is empty - SKIPPED!")
1395                 continue
1396             elif block["blocked"].endswith(".arpa"):
1397                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1398                 continue
1399             elif block["blocked"].endswith(".tld"):
1400                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1401                 continue
1402             elif block["blocked"].endswith(".onion"):
1403                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1404                 continue
1405             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1406                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1407                 obfuscated = obfuscated + 1
1408                 blocked = utils.deobfuscate_domain(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1409             elif not utils.is_domain_wanted(block["blocked"]):
1410                 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1411                 continue
1412             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1413                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1414                 continue
1415
1416             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1417             if blocked is not None and blocked != block["blocked"]:
1418                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1419                 obfuscated = obfuscated - 1
1420                 if blocks.is_instance_blocked(row["domain"], blocked):
1421                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1422                     continue
1423
1424                 block["block_level"] = utils.alias_block_level(block["block_level"])
1425
1426                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1427                 if utils.process_block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1428                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1429                     blockdict.append({
1430                         "blocked": blocked,
1431                         "reason" : block["reason"],
1432                     })
1433
1434         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1435         if obfuscated == 0 and len(blocking) > 0:
1436             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1437             instances.set_has_obfuscation(row["domain"], False)
1438
1439         if instances.has_pending(row["domain"]):
1440             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1441             instances.update_data(row["domain"])
1442
1443         logger.debug("Invoking commit() ...")
1444         database.connection.commit()
1445
1446         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1447         if config.get("bot_enabled") and len(blockdict) > 0:
1448             logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1449             network.send_bot_post(row["domain"], blockdict)
1450
1451     logger.debug("Success! - EXIT!")
1452     return 0
1453
1454 def fetch_fedilist(args: argparse.Namespace) -> int:
1455     logger.debug("args[]='%s' - CALLED!", type(args))
1456
1457     logger.debug("Invoking locking.acquire() ...")
1458     locking.acquire()
1459
1460     source_domain = "demo.fedilist.com"
1461     if sources.is_recent(source_domain):
1462         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1463         return 0
1464     else:
1465         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1466         sources.update(source_domain)
1467
1468     url = f"http://{source_domain}/instance/csv?onion=not"
1469     if args.software is not None and args.software != "":
1470         logger.debug("args.software='%s'", args.software)
1471         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1472
1473     logger.info("Fetching url='%s' ...", url)
1474     response = reqto.get(
1475         url,
1476         headers=network.web_headers,
1477         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1478         allow_redirects=False
1479     )
1480
1481     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1482     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1483         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", response.ok, response.status_code, len(response.text))
1484         return 1
1485
1486     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1487
1488     logger.debug("reader[]='%s'", type(reader))
1489     blockdict = list()
1490     for row in reader:
1491         logger.debug("row[]='%s'", type(row))
1492         domain = tidyup.domain(row["hostname"])
1493         logger.debug("domain='%s' - AFTER!", domain)
1494
1495         if domain == "":
1496             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1497             continue
1498         elif not utils.is_domain_wanted(domain):
1499             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1500             continue
1501         elif (args.all is None or not args.all) and instances.is_registered(domain):
1502             logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s'", type(args.all))
1503             continue
1504         elif instances.is_recent(domain):
1505             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1506             continue
1507
1508         logger.info("Fetching instances from domain='%s' ...", domain)
1509         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1510
1511     logger.debug("Success! - EXIT!")
1512     return 0
1513
1514 def update_nodeinfo(args: argparse.Namespace) -> int:
1515     logger.debug("args[]='%s' - CALLED!", type(args))
1516
1517     logger.debug("Invoking locking.acquire() ...")
1518     locking.acquire()
1519
1520     if args.domain is not None and args.domain != "":
1521         logger.debug("Fetching args.domain='%s'", args.domain)
1522         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1523     elif args.software is not None and args.software != "":
1524         logger.info("Fetching domains for args.software='%s'", args.software)
1525         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1526     else:
1527         logger.info("Fetching domains for recently updated ...")
1528         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1529
1530     domains = database.cursor.fetchall()
1531
1532     logger.info("Checking %d domain(s) ...", len(domains))
1533     cnt = 0
1534     for row in domains:
1535         logger.debug("row[]='%s'", type(row))
1536         try:
1537             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1538             software = federation.determine_software(row["domain"])
1539
1540             logger.debug("Determined software='%s'", software)
1541             if software != row["software"] and software is not None:
1542                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1543                 instances.set_software(row["domain"], software)
1544
1545             instances.set_success(row["domain"])
1546         except network.exceptions as exception:
1547             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1548             instances.set_last_error(row["domain"], exception)
1549
1550         instances.set_last_nodeinfo(row["domain"])
1551         instances.update_data(row["domain"])
1552         cnt = cnt + 1
1553
1554     logger.debug("Success! - EXIT!")
1555     return 0
1556
1557 def fetch_instances_social(args: argparse.Namespace) -> int:
1558     logger.debug("args[]='%s' - CALLED!", type(args))
1559
1560     logger.debug("Invoking locking.acquire() ...")
1561     locking.acquire()
1562
1563     source_domain = "instances.social"
1564
1565     if config.get("instances_social_api_key") == "":
1566         logger.error("API key not set. Please set in your config.json file.")
1567         return 1
1568     elif sources.is_recent(source_domain):
1569         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1570         return 0
1571     else:
1572         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1573         sources.update(source_domain)
1574
1575     headers = {
1576         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1577     }
1578
1579     fetched = network.get_json_api(
1580         source_domain,
1581         "/api/1.0/instances/list?count=0&sort_by=name",
1582         headers,
1583         (config.get("connection_timeout"), config.get("read_timeout"))
1584     )
1585     logger.debug("fetched[]='%s'", type(fetched))
1586
1587     if "error_message" in fetched:
1588         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1589         return 2
1590     elif "exception" in fetched:
1591         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1592         return 3
1593     elif "json" not in fetched:
1594         logger.warning("fetched has no element 'json' - EXIT!")
1595         return 4
1596     elif "instances" not in fetched["json"]:
1597         logger.warning("fetched[row] has no element 'instances' - EXIT!")
1598         return 5
1599
1600     domains = list()
1601     rows = fetched["json"]["instances"]
1602
1603     logger.info("Checking %d row(s) ...", len(rows))
1604     for row in rows:
1605         logger.debug("row[]='%s'", type(row))
1606         domain = tidyup.domain(row["name"])
1607
1608         logger.debug("domain='%s' - AFTER!", domain)
1609         if domain == "":
1610             logger.debug("domain is empty - SKIPPED!")
1611             continue
1612         elif not utils.is_domain_wanted(domain):
1613             logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1614             continue
1615         elif domain in domains:
1616             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1617             continue
1618         elif instances.is_registered(domain):
1619             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1620             continue
1621         elif instances.is_recent(domain):
1622             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1623             continue
1624
1625         logger.info("Fetching instances from domain='%s'", domain)
1626         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1627
1628     logger.debug("Success! - EXIT!")
1629     return 0