# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

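# Checks whether args.domain is a valid, not blacklisted and not yet registered
# domain name and reports the result as a status code (0 = not known yet).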
def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

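# Fetches the full server list from pixelfed.org's API and queues all new,
# wanted domains for instance fetching.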
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        logger.debug("Checking CSRF from pixelfed.org")
        headers = csrf.determine("pixelfed.org", dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            "pixelfed.org",
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif not utils.is_domain_wanted(row["domain"]):
                logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
                continue
            elif instances.is_registered(row["domain"]):
                logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
                continue
            elif instances.is_recent(row["domain"]):
                logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                continue

            logger.debug("Fetching instances from row[domain]='%s' ...", row["domain"])
            federation.fetch_instances(row["domain"], None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

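# Fetches a domain list from the gql.api.bka.li GraphQL endpoint and adds all
# new, wanted domains as instances.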
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    domains = list()
    try:
        fetched = network.post_json_api("gql.api.bka.li", "/v1/graphql", json.dumps({
            "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
        }))

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.api.bka.li' returned error message='%s'", fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch GraphQL, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

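# Fetches block lists from all registered instances running supported software,
# or only from a single domain/software when --domain/--software is given, and
# records the blocks in the database.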
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        blockdict = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].count("*") > 0:
                logger.debug("blocker='%s' uses obfuscated domains, marking ...", blocker)
                instances.set_has_obfuscation(blocker, True)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                block["blocked"] = row[0]
                origin           = row[1]
                nodeinfo_url     = row[2]
            elif block["blocked"].count("?") > 0:
                logger.debug("blocker='%s' uses obfuscated domains, marking ...", blocker)
                instances.set_has_obfuscation(blocker, True)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                block["blocked"] = row[0]
                origin           = row[1]
                nodeinfo_url     = row[2]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            if block["block_level"] == "silence":
                logger.debug("Block level 'silence' has been changed to 'silenced'")
                block["block_level"] = "silenced"
            elif block["block_level"] == "suspend":
                logger.debug("Block level 'suspend' has been changed to 'suspended'")
                block["block_level"] = "suspended"

            utils.process_block(blocker, block["blocked"], block["reason"], block["block_level"])

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config[bot_enabled]='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

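# Fetches instance tables from fediverse.observer, one per software type, and
# queues all new, wanted domains for instance fetching.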
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    types = [
        "akkoma",
        "birdsitelive",
        "bookwyrm",
        "calckey",
        "diaspora",
        "foundkey",
        "friendica",
        "funkwhale",
        "gancio",
        "gnusocial",
        "gotosocial",
        "hometown",
        "hubzilla",
        "kbin",
        "ktistec",
        "lemmy",
        "mastodon",
        "microblogpub",
        "misskey",
        "mitra",
        "mobilizon",
        "owncast",
        "peertube",
        "pixelfed",
        "pleroma",
        "plume",
        "snac",
        "takahe",
        "wildebeest",
        "writefreely"
    ]

    locking.acquire()

    logger.info("Fetching table data for %d software types ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://fediverse.observer/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from fediverse.observer: '%s'", software, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()

            logger.debug("domain='%s'", domain)
            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

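# Fetches the silenced/limited and suspended server lists from wiki.todon.eu
# and records them as blocks by todon.eu.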
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    locking.acquire()
    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    raw = utils.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, "todon.eu", None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked("todon.eu", blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            blocks.add_instance("todon.eu", blocked, None, block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("Success! - EXIT!")
    return 0

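# Fetches chaos.social's federation.md block list (Markdown) and records the
# silenced and blocked instances as blocks by chaos.social.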
def fetch_cs(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    domains = {
        "silenced": list(),
        "reject"  : list(),
    }

    raw = utils.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d,doc[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    domains["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    domains["reject"] = federation.find_domains(blocked)

    logger.debug("domains[silenced]()=%d,domains[reject]()=%d", len(domains["silenced"]), len(domains["reject"]))
    if len(domains["silenced"]) > 0 or len(domains["reject"]) > 0:
        locking.acquire()

        for block_level in domains:
            logger.info("block_level='%s' has %d row(s)", block_level, len(domains[block_level]))

            for row in domains[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if instances.is_recent(row["domain"], "last_blocked"):
                    logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], "chaos.social", None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                utils.process_block("chaos.social", row["domain"], row["reason"], block_level)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("Success! - EXIT!")
    return 0

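# Fetches an FBA-specific RSS feed given by args.feed and queues all new,
# wanted domains found in its items for instance fetching.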
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    domains = list()

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item='%s'", item)
            domain = item.link.split("=")[1]

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

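# Fetches the ATOM feed of the FBA bot account and queues all new, wanted
# domains found in linked entries for instance fetching.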
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    feed = "https://ryona.agency/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s'", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s'", domain)
                    if not utils.is_domain_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        locking.acquire()

        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

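# Fetches instances from args.domain, then, unless --single is given,
# re-crawls all known instances whose last fetch is older than the configured
# recheck interval.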
def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("domain='%s'", row[0])
        if not utils.is_domain_wanted(row[0]):
            logger.debug("Domain row[0]='%s' is not wanted - SKIPPED!", row[0])
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", row[0], row[1], row[2], row[3])
            federation.fetch_instances(row[0], row[1], row[2], inspect.currentframe().f_code.co_name, row[3])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[0]='%s'", type(exception), row[0])
            instances.set_last_error(row[0], exception)

    logger.debug("Success - EXIT!")
    return 0

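# Downloads the CSV block lists from oliphant's blocklists repository on
# Codeberg and records each row as 'reject' (plus 'reject_media' and
# 'reject_reports' where flagged) blocks.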
def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Base URL
    base_url = "https://codeberg.org/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue
        elif instances.is_recent(block["blocker"]):
            logger.debug("block[blocker]='%s' has been recently crawled - SKIPPED!", block["blocker"])
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.content != b"":
            logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
            reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

            logger.debug("reader[]='%s'", type(reader))
            for row in reader:
                logger.debug("row[%s]='%s'", type(row), row)
                domain = severity = None
                reject_media = reject_reports = False
                if "#domain" in row:
                    domain = row["#domain"]
                elif "domain" in row:
                    domain = row["domain"]
                else:
                    logger.debug("row='%s' does not contain domain column", row)
                    continue

                if "#severity" in row:
                    severity = row["#severity"]
                elif "severity" in row:
                    severity = row["severity"]
                else:
                    logger.debug("row='%s' does not contain severity column", row)
                    continue

                if "#reject_media" in row and row["#reject_media"].lower() == "true":
                    reject_media = True
                elif "reject_media" in row and row["reject_media"].lower() == "true":
                    reject_media = True

                if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                    reject_reports = True
                elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                    reject_reports = True

                logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
                if not utils.is_domain_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue

                logger.debug("Marking domain='%s' as handled", domain)
                domains.append(domain)

                logger.debug("Processing domain='%s' ...", domain)
                processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
                logger.debug("processed='%s'", processed)

                utils.process_block(block["blocker"], domain, None, "reject")
                if reject_media:
                    utils.process_block(block["blocker"], domain, None, "reject_media")
                if reject_reports:
                    utils.process_block(block["blocker"], domain, None, "reject_reports")

    logger.debug("Success! - EXIT!")
    return 0

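# Fetches static text file block lists (currently seirdy.one's) and processes
# every listed domain.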
def fetch_txt(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    # Static URLs
    urls = ({
        "blocker": "seirdy.one",
        "url"    : "https://seirdy.one/pb/bsl.txt",
    },)

    logger.info("Checking %d text file(s) ...", len(urls))
    for row in urls:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code < 300 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s'", domain)
                if domain == "":
                    logger.debug("domain is empty - SKIPPED!")
                    continue
                elif not utils.is_domain_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif instances.is_recent(domain):
                    logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
                processed = utils.process_domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)

                logger.debug("processed='%s'", processed)
                if not processed:
                    logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
                    continue

    logger.debug("Success! - EXIT!")
    return 0

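# Fetches the participant list from fedipact.online and queues all new,
# wanted domains for instance fetching.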
def fetch_fedipact(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    response = utils.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0])

            logger.debug("domain='%s'", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

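# Scrapes the FediBlock tables from joinfediverse.wiki and records the listed
# domains as 'reject' blocks by the registered climatejustice.* instances.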
def fetch_joinfediverse(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    locking.acquire()

    raw = utils.fetch_url("https://joinfediverse.wiki/FediBlock", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    tables = doc.findAll("table", {"class": "wikitable"})

    logger.info("Analyzing %d table(s) ...", len(tables))
    blocklist = list()
    for table in tables:
        logger.debug("table[]='%s'", type(table))

        rows = table.findAll("tr")
        logger.info("Checking %d row(s) ...", len(rows))
        block_headers = dict()
        for row in rows:
            logger.debug("row[%s]='%s'", type(row), row)

            headers = row.findAll("th")
            logger.debug("Found headers()=%d header(s)", len(headers))
            if len(headers) > 1:
                block_headers = dict()
                cnt = 0
                for header in headers:
                    cnt = cnt + 1
                    logger.debug("header[]='%s',cnt=%d", type(header), cnt)
                    text = header.contents[0]

                    logger.debug("text[]='%s'", type(text))
                    if not isinstance(text, str):
                        logger.debug("text[]='%s' is not 'str' - SKIPPED!", type(text))
                        continue
                    elif validators.domain(text.strip()):
                        logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
                        continue

                    text = tidyup.domain(text.strip())
                    logger.debug("text='%s'", text)
                    if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
                        logger.debug("Found header: '%s'=%d", text, cnt)
                        block_headers[cnt] = text

            elif len(block_headers) == 0:
                logger.debug("row is not scrapable - SKIPPED!")
                continue
            elif len(block_headers) > 0:
                logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
                cnt = 0
                block = dict()

                for element in row.find_all(["th", "td"]):
                    cnt = cnt + 1
                    logger.debug("element[]='%s',cnt=%d", type(element), cnt)
                    if cnt in block_headers:
                        logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])

                        text = element.text.strip()
                        key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"

                        logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
                        if key == "blocked":
                            block[key] = text
                        elif key == "reason":
                            block[key] = tidyup.reason(text)
                        elif key == "subdomain(s)":
                            block[key] = list()
                            if text != "":
                                block[key] = text.split("/")
                        else:
                            logger.debug("key='%s'", key)
                            block[key] = text

                logger.debug("block()=%d ...", len(block))
                if len(block) > 0:
                    logger.debug("Appending block()=%d ...", len(block))
                    blocklist.append(block)

    logger.debug("blocklist()=%d", len(blocklist))

    database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
    domains = database.cursor.fetchall()

    logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
    blocking = list()
    for block in blocklist:
        logger.debug("block='%s'", block)
        if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
            origin = block["blocked"]
            for subdomain in block["subdomain(s)"]:
                # Copy the block record so each subdomain gets its own entry
                # instead of all appended entries sharing (and overwriting) one dict
                subdomain_block = block.copy()
                subdomain_block["blocked"] = subdomain + "." + origin
                blocking.append(subdomain_block)
        else:
            blocking.append(block)

    logger.debug("blocking()=%d", len(blocking))
    for block in blocking:
        block["blocked"] = tidyup.domain(block["blocked"])

        if not utils.is_domain_wanted(block["blocked"]):
            logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            continue
        elif instances.is_recent(block["blocked"]):
            logger.debug("blocked='%s' has been recently checked - SKIPPED!", block["blocked"])
            continue

        logger.info("Processing blocked='%s' ...", block["blocked"])
        processed = utils.process_domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)

    blockdict = list()
    for blocker in domains:
        blocker = blocker[0]
        logger.debug("blocker[%s]='%s'", type(blocker), blocker)

        for block in blocking:
            block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None

            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
            utils.process_block(blocker, block["blocked"], block["reason"], "reject")

        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("config[bot_enabled]='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0