# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
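    """
    Sanity-checks a single domain passed on the command line: validates its
    syntax, consults the blacklist and looks it up in the local database.

    Returns 0 when the domain is not yet known, 100 when it is syntactically
    invalid, 101 when it is blacklisted and 102 when it is already registered.

    Usage sketch (hypothetical, assuming an argparse front-end fills in
    args.domain):

        check_instance(argparse.Namespace(domain="example.com"))
    """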
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
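    """
    Fetches the public server list from the pixelfed.org API and hands every
    new, wanted domain to federation.fetch_instances().

    Returns 0 on success, 100 when the CSRF check fails, 101/102 on API
    errors and 103 on network exceptions.
    """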
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, so there is no need to add network.api_headers here
    headers = tuple()

    try:
        logger.debug("Checking CSRF from pixelfed.org")
        headers = csrf.determine("pixelfed.org", dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            "pixelfed.org",
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
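    """
    Queries the GraphQL endpoint at gql.api.bka.li for its list of known
    domains and fetches instance data for every new, wanted one.

    Returns 0 on success, 100/101 on API error responses and 102 on network
    exceptions.
    """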
    logger.debug("args[]='%s' - CALLED!", type(args))
    domains = list()
    try:
        fetched = network.post_json_api("gql.api.bka.li", "/v1/graphql", json.dumps({
            "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
        }))

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.api.bka.li' returned error message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
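    """
    Fetches block lists from registered instances running supported software
    and records new or updated blocks. With args.domain or args.software set,
    only that domain or software is re-checked; otherwise all instances whose
    last check is older than the configured 'recheck_block' interval are
    processed.

    Returns 0 on success; 100/101/102 when args.domain is invalid,
    blacklisted or not registered.
    """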
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        blockdict = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block['block_level'], block["reason"])

            if block['block_level'] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].count("*") > 0:
                logger.debug("blocker='%s' uses obfuscated domains, marking ...", blocker)
                instances.set_has_obfuscation(blocker, True)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                block["blocked"] = row[0]
                origin           = row[1]
                nodeinfo_url     = row[2]
            elif block["blocked"].count("?") > 0:
                logger.debug("blocker='%s' uses obfuscated domains, marking ...", blocker)
                instances.set_has_obfuscation(blocker, True)

                # Some instances obscure domains with question marks; it is unclear whether this depends on the version
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                block["blocked"] = row[0]
                origin           = row[1]
                nodeinfo_url     = row[2]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block['block_level'] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            if block['block_level'] == "silence":
                logger.debug("Block level 'silence' has been changed to 'silenced'")
                block['block_level'] = "silenced"
            elif block['block_level'] == "suspend":
                logger.debug("Block level 'suspend' has been changed to 'suspended'")
                block['block_level'] = "suspended"

            if not blocks.is_instance_blocked(blocker, block["blocked"], block['block_level']):
                logger.debug("Invoking blocks.add_instance(%s, %s, %s, %s)", blocker, block["blocked"], block["reason"], block['block_level'])
                blocks.add_instance(blocker, block["blocked"], block["reason"], block['block_level'])

                logger.debug("block_level='%s',config[bot_enabled]='%s'", block['block_level'], config.get("bot_enabled"))
                if block['block_level'] == "reject" and config.get("bot_enabled"):
                    logger.debug("blocker='%s' has blocked '%s' with reason='%s' - Adding to bot notification ...", blocker, block["blocked"], block["reason"])
                    blockdict.append({
                        "blocked": block["blocked"],
                        "reason" : block["reason"],
                    })
            else:
                logger.debug("Updating block last seen and reason for blocker='%s',blocked='%s' ...", blocker, block["blocked"])
                blocks.update_last_seen(blocker, block["blocked"], block['block_level'])
                blocks.update_reason(block["reason"], blocker, block["blocked"], block['block_level'])

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config[bot_enabled]='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
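    """
    Scrapes fediverse.observer's per-software table data and registers every
    new, wanted domain found there. With args.software set, only that
    software type is fetched.

    Always returns 0.
    """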
    logger.debug("args[]='%s' - CALLED!", type(args))
    types = [
        "akkoma",
        "birdsitelive",
        "bookwyrm",
        "calckey",
        "diaspora",
        "foundkey",
        "friendica",
        "funkwhale",
        "gancio",
        "gnusocial",
        "gotosocial",
        "hometown",
        "hubzilla",
        "kbin",
        "ktistec",
        "lemmy",
        "mastodon",
        "microblogpub",
        "misskey",
        "mitra",
        "mobilizon",
        "owncast",
        "peertube",
        "pixelfed",
        "pleroma",
        "plume",
        "snac",
        "takahe",
        "wildebeest",
        "writefreely"
    ]

    locking.acquire()

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://fediverse.observer/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features='html.parser')
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from fediverse.observer: '%s'", software, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s'", domain)
            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been handled recently - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
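    """
    Scrapes the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks by 'todon.eu'.

    Always returns 0.
    """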
    logger.debug("args[]='%s' - CALLED!", type(args))

    locking.acquire()
    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, 'todon.eu', None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked("todon.eu", blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            blocks.add_instance("todon.eu", blocked, None, block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
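    """
    Parses chaos.social's federation.md from GitHub (rendered from Markdown)
    and records the silenced and blocked instances as blocks by
    'chaos.social'.

    Always returns 0.
    """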
    logger.debug("args[]='%s' - CALLED!", type(args))
    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    domains = {
        "silenced": list(),
        "reject"  : list(),
    }

    raw = utils.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features='html.parser')
    logger.debug("doc()=%d,doc[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    domains["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    domains["reject"] = federation.find_domains(blocked)

    logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"]))
    if len(domains["silenced"]) > 0 or len(domains["reject"]) > 0:
        locking.acquire()

        for block_level in domains:
            logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level]))

            for row in domains[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], 'chaos.social', None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if not blocks.is_instance_blocked('chaos.social', row["domain"], block_level):
                    logger.debug("domain='%s',block_level='%s' blocked by chaos.social, adding ...", row["domain"], block_level)
                    blocks.add_instance('chaos.social', row["domain"], row["reason"], block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
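    """
    Fetches an FBA-specific RSS feed given as args.feed and registers every
    new, wanted domain linked from it. Always returns 0.

    Usage sketch (hypothetical argparse wiring and feed URL):

        fetch_fba_rss(argparse.Namespace(feed="https://example.com/feed.rss"))
    """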
    logger.debug("args[]='%s' - CALLED!", type(args))
    domains = list()

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
            domain = item.link.split("=")[1]

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
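    """
    Parses the Atom feed of the FBA bot account on ryona.agency and registers
    every new, wanted domain found in the entries' links.

    Always returns 0.
    """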
    logger.debug("args[]='%s' - CALLED!", type(args))
    feed = "https://ryona.agency/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s'", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s'", domain)
                    if not utils.is_domain_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
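    """
    Fetches instance data for args.domain and, unless args.single is set,
    re-crawls all known instances with supported software whose last fetch is
    older than the configured 'recheck_instance' interval.

    Returns 0 on success and 100 when the initial fetch fails.
    """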
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("domain='%s'", row[0])
        if not utils.is_domain_wanted(row[0]):
            logger.debug("Domain row[0]='%s' is not wanted - SKIPPED!", row[0])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row[0], row[1], row[2], row[3])
            federation.fetch_instances(row[0], row[1], row[2], inspect.currentframe().f_code.co_name, row[3])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[0]='%s'", type(exception), row[0])
            instances.set_last_error(row[0], exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
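    """
    Downloads the CSV block lists from oliphant's Codeberg repository and
    processes every new, wanted domain found in them. With args.domain set,
    only the matching blocker's list is fetched.

    Always returns 0.
    """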
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Base URL
    base_url = "https://codeberg.org/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block['csv_url'], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.content != b"":
            logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
            reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect="unix")

            logger.debug("reader[]='%s'", type(reader))
            for row in reader:
                logger.debug("row[%s]='%s'", type(row), row)
                domain = None
                if "#domain" in row:
                    domain = row["#domain"]
                elif "domain" in row:
                    domain = row["domain"]
                else:
                    logger.debug("row='%s' does not contain domain column", row)
                    continue

                logger.debug("domain='%s'", domain)
                if not utils.is_domain_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Marking domain='%s' as handled", domain)
                domains.append(domain)

                logger.debug("Processing domain='%s' ...", domain)
                processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
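    """
    Downloads static plain-text block lists (currently seirdy.one's) and
    processes every new, wanted domain found in them.

    Always returns 0.
    """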
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s'", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fedipact(args: argparse.Namespace) -> int:
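    """
    Scrapes the list of pledged instances from fedipact.online and registers
    every new, wanted domain.

    Always returns 0.
    """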
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    response = utils.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0])

            logger.debug("domain='%s'", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinfediverse(args: argparse.Namespace) -> int:
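    """
    Scrapes the FediBlock tables on joinfediverse.wiki, expands listed
    subdomains, processes the blocked domains and records them as 'reject'
    blocks by the local climatejustice.* instances found in the database.

    Always returns 0.
    """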
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    raw = utils.fetch_url("https://joinfediverse.wiki/FediBlock", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    tables = doc.findAll("table", {"class": "wikitable"})

    logger.info("Analyzing %d table(s) ...", len(tables))
    blocklist = list()
    for table in tables:
        logger.debug("table[]='%s'", type(table))

        rows = table.findAll("tr")
        logger.info("Checking %d row(s) ...", len(rows))
        block_headers = dict()
        for row in rows:
            #logger.debug("row[%s]='%s'", type(row), row)

            headers = row.findAll("th")
            #logger.debug("Found headers()=%d header(s)", len(headers))
            if len(headers) > 1:
                block_headers = dict()
                cnt = 0
                for header in headers:
                    cnt = cnt + 1
                    #logger.debug("header[]='%s',cnt=%d", type(header), cnt)
                    text = header.contents[0]

                    #logger.debug("text[]='%s'", type(text))
                    if not isinstance(text, str):
                        #logger.debug("text[]='%s' is not 'str' - SKIPPED!", type(text))
                        continue
                    elif validators.domain(text.strip()):
                        #logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
                        continue

                    text = tidyup.domain(text.strip())
                    #logger.debug("text='%s'", text)
                    if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
                        logger.debug("Found header: '%s'=%d", text, cnt)
                        block_headers[cnt] = text
            elif len(block_headers) == 0:
                #logger.debug("row is not scrapable - SKIPPED!")
                continue
            elif len(block_headers) > 0:
                logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
                cnt = 0
                block = dict()

                for element in row.find_all(["th", "td"]):
                    cnt = cnt + 1
                    logger.debug("element[]='%s',cnt=%d", type(element), cnt)
                    if cnt in block_headers:
                        logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])

                        text = element.text.strip()
                        key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"

                        logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
                        if key == "blocked":
                            block[key] = text
                        elif key == "subdomain(s)":
                            block[key] = list()
                            if text != "":
                                block[key] = text.split("/")
                        else:
                            logger.debug("key='%s'", key)
                            block[key] = text

                logger.debug("block()=%d ...", len(block))
                if len(block) > 0:
                    logger.debug("Appending block()=%d ...", len(block))
                    blocklist.append(block)

    logger.debug("blocklist()=%d", len(blocklist))

    database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
    domains = database.cursor.fetchall()

    logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
    blocking = list()
    for block in blocklist:
        logger.debug("block='%s'", block)
        if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
            origin = block["blocked"]
            for subdomain in block["subdomain(s)"]:
                # Append a copy per subdomain; appending the same dict would
                # leave every entry pointing at the last subdomain
                entry = block.copy()
                entry["blocked"] = subdomain + "." + origin
                blocking.append(entry)
        else:
            blocking.append(block)

    logger.debug("blocking()=%d", len(blocking))
    for block in blocking:
        block["blocked"] = tidyup.domain(block["blocked"])

        if not utils.is_domain_wanted(block["blocked"]):
            logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            continue
        elif instances.is_recent(block["blocked"]):
            logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
            continue

        logger.info("Processing blocked='%s' ...", block["blocked"])
        processed = utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
    blockdict = list()
    for blocker in domains:
        blocker = blocker[0]
        logger.debug("blocker[%s]='%s'", type(blocker), blocker)

        for block in blocking:
            block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None

            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s'", block['blocked'], block['reason'])
            if not blocks.is_instance_blocked(blocker, block['blocked'], "reject"):
                logger.debug("Invoking blocks.add_instance(%s, %s, %s, %s)", blocker, block['blocked'], block["reason"], "reject")
                blocks.add_instance(blocker, block['blocked'], block["reason"], "reject")

                logger.debug("block_level='%s',config[bot_enabled]='%s'", "reject", config.get("bot_enabled"))
                if config.get("bot_enabled"):
                    logger.debug("blocker='%s' has blocked '%s' with reason='%s' - Adding to bot notification ...", blocker, block['blocked'], block["reason"])
                    blockdict.append({
                        "blocked": block['blocked'],
                        "reason" : block["reason"],
                    })
            else:
                logger.debug("Updating block last seen and reason for blocker='%s',blocked='%s' ...", blocker, block['blocked'])
                blocks.update_last_seen(blocker, block['blocked'], "reject")
                blocks.update_reason(block["reason"], blocker, block['blocked'], "reject")

        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("config[bot_enabled]='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0