# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
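    """ Checks whether args.domain is a valid, non-blacklisted and not yet
        registered domain. Returns 0 if the domain is unknown, otherwise a
        non-zero status code (100-102). """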
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
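    """ Reports all registered instances whose stored nodeinfo URL matches
        neither their domain nor its punycode representation. Always
        returns 0. """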
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d mismatching row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
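    """ Fetches the server list from pixelfed.org's API and registers all
        new, wanted domains found there. """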
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, so there is no need to merge in network.source_headers here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON from pixelfed.org API, exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
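    """ Fetches a domain list from the gql.api.bka.li GraphQL API and
        registers all new, wanted domains. """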
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not utils.is_domain_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
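    """ Fetches block lists from registered instances, either for a single
        domain (args.domain), a single software (args.software) or for all
        supported instances, and records the blocks found. """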
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        if blocker == "":
            logger.warning("blocker is now empty!")
            continue
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
            continue
        elif not utils.is_domain_wanted(blocker):
            logger.debug("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        else:
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("blocker='%s'", blocker)
        if blocker != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", blocker, block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
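    """ Scrapes fediverse.observer, either for args.software only or for
        all software types found in its navigation bar, and registers all
        new, wanted domains. """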
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s'", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
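    """ Fetches the silenced/suspended block lists from wiki.todon.eu and
        records them as blocks by todon.eu. """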
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
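    """ Fetches chaos.social's federation.md from raw.githubusercontent.com
        and records the silenced/blocked instances listed there. """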
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc[%s]()=%d", type(doc), len(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
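    """ Parses an FBA-specific RSS feed (args.feed) and registers all new,
        wanted domains found in it. """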
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
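    """ Parses the FBA bot's Atom feed on ryona.agency and registers all
        new, wanted domains linked from its entries. """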
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not utils.is_domain_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
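    """ Fetches instances starting from args.domain. Unless --single is
        given, also re-crawls all known instances whose last fetch is older
        than the configured recheck interval. """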
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initial fetch
    try:
        logger.info("Fetching instances from args.domain='%s' ...", args.domain)
        federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
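    """ Downloads oliphant's CSV block lists from codeberg.org and records
        the blocks found in them. """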
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    # URLs to fetch
    blocklists = (
        {
            "blocker": "artisan.chat",
            "csv_url": "mastodon/artisan.chat.csv",
        },{
            "blocker": "mastodon.art",
            "csv_url": "mastodon/mastodon.art.csv",
        },{
            "blocker": "pleroma.envs.net",
            "csv_url": "mastodon/pleroma.envs.net.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/_unified_tier3_blocklist.csv",
        },{
            "blocker": "mastodon.online",
            "csv_url": "mastodon/mastodon.online.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "mastodon/mastodon.social.csv",
        },{
            "blocker": "mastodon.social",
            "csv_url": "other/missing-tier0-mastodon.social.csv",
        },{
            "blocker": "rage.love",
            "csv_url": "mastodon/rage.love.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/sunny.garden.csv",
        },{
            "blocker": "sunny.garden",
            "csv_url": "mastodon/gardenfence.csv",
        },{
            "blocker": "solarpunk.moe",
            "csv_url": "mastodon/solarpunk.moe.csv",
        },{
            "blocker": "toot.wales",
            "csv_url": "mastodon/toot.wales.csv",
        },{
            "blocker": "union.place",
            "csv_url": "mastodon/union.place.csv",
        },{
            "blocker": "oliphant.social",
            "csv_url": "mastodon/birdsite.csv",
        }
    )

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists))
    for block in blocklists:
        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if block["blocker"] != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
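    """ Downloads static plain-text block lists (currently seirdy.one) and
        processes each domain found in them. """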
1154     logger.debug("args[]='%s' - CALLED!", type(args))
1155
1156     logger.debug("Invoking locking.acquire() ...")
1157     locking.acquire()
1158
1159     # Static URLs
1160     urls = ({
1161         "blocker": "seirdy.one",
1162         "url"    : "https://seirdy.one/pb/bsl.txt",
1163     },)
1164
1165     logger.info("Checking %d text file(s) ...", len(urls))
1166     for row in urls:
1167         logger.debug("Fetching row[url]='%s' ...", row["url"])
1168         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1169
1170         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1171         if response.ok and response.status_code < 300 and response.text != "":
1172             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1173             domains = response.text.split("\n")
1174
1175             logger.info("Processing %d domains ...", len(domains))
1176             for domain in domains:
1177                 logger.debug("domain='%s' - BEFORE!", domain)
1178                 domain = tidyup.domain(domain)
1179
1180                 logger.debug("domain='%s' - AFTER!", domain)
1181                 if domain == "":
1182                     logger.debug("domain is empty - SKIPPED!")
1183                     continue
1184                 elif not utils.is_domain_wanted(domain):
1185                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1186                     continue
1187                 elif instances.is_recent(domain):
1188                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1189                     continue
1190
1191                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1192                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1193
1194                 logger.debug("processed='%s'", processed)
1195                 if not processed:
1196                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1197                     continue
1198
1199     logger.debug("Success! - EXIT!")
1200     return 0
1201
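# Usage sketch for fetch_txt (hypothetical REPL wiring; the real CLI builds the
# Namespace from its sub-command parser, and no attributes are read from it here):
#
#     import argparse
#     from fba import commands
#     status = commands.fetch_txt(argparse.Namespace())  # 0 on success
#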
1202 def fetch_fedipact(args: argparse.Namespace) -> int:
1203     logger.debug("args[]='%s' - CALLED!", type(args))
1204
1205     logger.debug("Invoking locking.acquire() ...")
1206     locking.acquire()
1207
1208     source_domain = "fedipact.online"
1209     if sources.is_recent(source_domain):
1210         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1211         return 0
1212     else:
1213         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1214         sources.update(source_domain)
1215
1216     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1217     response = utils.fetch_url(
1218         f"https://{source_domain}",
1219         network.web_headers,
1220         (config.get("connection_timeout"), config.get("read_timeout"))
1221     )
1222
1223     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1224     if response.ok and response.status_code < 300 and response.text != "":
1225         logger.debug("Parsing %d Bytes ...", len(response.text))
1226
1227         doc = bs4.BeautifulSoup(response.text, "html.parser")
1228         logger.debug("doc[]='%s'", type(doc))
1229
1230         rows = doc.findAll("li")
1231         logger.info("Checking %d row(s) ...", len(rows))
1232         for row in rows:
1233             logger.debug("row[]='%s'", type(row))
1234             domain = tidyup.domain(row.contents[0])
1235
1236             logger.debug("domain='%s' - AFTER!", domain)
1237             if domain == "":
1238                 logger.debug("domain is empty - SKIPPED!")
1239                 continue
1240
1241             logger.debug("domain='%s' - BEFORE!", domain)
1242             domain = domain.encode("idna").decode("utf-8")
1243             logger.debug("domain='%s' - AFTER!", domain)
1244
1245             if not utils.is_domain_wanted(domain):
1246                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1247                 continue
1248             elif instances.is_registered(domain):
1249                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1250                 continue
1251             elif instances.is_recent(domain):
1252                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1253                 continue
1254
1255             logger.info("Fetching domain='%s' ...", domain)
1256             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1257
1258     logger.debug("Success! - EXIT!")
1259     return 0
1260
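# fetch_fedipact scrapes the pact's front page for <li> entries; the parser
# above assumes markup roughly like this (illustrative, not a documented API):
#
#     <li>example.social</li>
#     <li>another-instance.example</li>
#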
1261 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1262     logger.debug("args[]='%s' - CALLED!", type(args))
1263
1264     logger.debug("Invoking locking.acquire() ...")
1265     locking.acquire()
1266
1267     source_domain = "instances.joinmobilizon.org"
1268     if sources.is_recent(source_domain):
1269         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1270         return 0
1271     else:
1272         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1273         sources.update(source_domain)
1274
1275     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1276     raw = utils.fetch_url(
1277         f"https://{source_domain}/api/v1/instances",
1278         network.web_headers,
1279         (config.get("connection_timeout"), config.get("read_timeout"))
1280     ).text
1281     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1282
1283     parsed = json.loads(raw)
1284     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1285
1286     if "data" not in parsed:
1287         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1288         return 1
1289
1290     logger.info("Checking %d instances ...", len(parsed["data"]))
1291     for row in parsed["data"]:
1292         logger.debug("row[]='%s'", type(row))
1293         if "host" not in row:
1294             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1295             continue
1296         elif not utils.is_domain_wanted(row["host"]):
1297             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1298             continue
1299         elif instances.is_registered(row["host"]):
1300             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1301             continue
1302
1303         logger.info("Fetching row[host]='%s' ...", row["host"])
1304         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1305
1306     logger.debug("Success! - EXIT!")
1307     return 0
1308
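# The joinmobilizon endpoint handled above is expected to return JSON shaped
# roughly like this (illustrative sample, not captured output):
#
#     {"data": [{"host": "mobilizon.example"}, {"host": "events.example"}]}
#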
1309 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1310     logger.debug("args[]='%s' - CALLED!", type(args))
1311
1312     logger.debug("Invoking locking.acquire() ...")
1313     locking.acquire()
1314
1315     source_domain = "instanceapp.misskey.page"
1316     if sources.is_recent(source_domain):
1317         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1318         return 0
1319     else:
1320         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1321         sources.update(source_domain)
1322
1323     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1324     raw = utils.fetch_url(
1325         f"https://{source_domain}/instances.json",
1326         network.web_headers,
1327         (config.get("connection_timeout"), config.get("read_timeout"))
1328     ).text
1329     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1330
1331     parsed = json.loads(raw)
1332     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1333
1334     if "instancesInfos" not in parsed:
1335         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1336         return 1
1337
1338     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1339     for row in parsed["instancesInfos"]:
1340         logger.debug("row[%s]='%s'", type(row), row)
1341         if "url" not in row:
1342             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1343             continue
1344         elif not utils.is_domain_wanted(row["url"]):
1345             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1346             continue
1347         elif instances.is_registered(row["url"]):
1348             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1349             continue
1350
1351         logger.info("Fetching row[url]='%s' ...", row["url"])
1352         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1353
1354     logger.debug("Success! - EXIT!")
1355     return 0
1356
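# instances.json from the source above is expected to be shaped roughly like
# this (illustrative; only "url" is read per row):
#
#     {"instancesInfos": [{"url": "misskey.example", "name": "..."}, ...]}
#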
1357 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1358     logger.debug("args[]='%s' - CALLED!", type(args))
1359
1360     logger.debug("Invoking locking.acquire() ...")
1361     locking.acquire()
1362
1363     source_domain = "joinfediverse.wiki"
1364     if sources.is_recent(source_domain):
1365         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1366         return 0
1367     else:
1368         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1369         sources.update(source_domain)
1370
1371     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1372     raw = utils.fetch_url(
1373         f"https://{source_domain}/FediBlock",
1374         network.web_headers,
1375         (config.get("connection_timeout"), config.get("read_timeout"))
1376     ).text
1377     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1378
1379     doc = bs4.BeautifulSoup(raw, "html.parser")
1380     logger.debug("doc[]='%s'", type(doc))
1381
1382     tables = doc.findAll("table", {"class": "wikitable"})
1383
1384     logger.info("Analyzing %d table(s) ...", len(tables))
1385     blocklist = list()
1386     for table in tables:
1387         logger.debug("table[]='%s'", type(table))
1388
1389         rows = table.findAll("tr")
1390         logger.info("Checking %d row(s) ...", len(rows))
1391         block_headers = dict()
1392         for row in rows:
1393             logger.debug("row[%s]='%s'", type(row), row)
1394
1395             headers = row.findAll("th")
1396             logger.debug("Found headers()=%d header(s)", len(headers))
1397             if len(headers) > 1:
1398                 block_headers = dict()
1399                 cnt = 0
1400                 for header in headers:
1401                     cnt = cnt + 1
1402                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1403                     text = header.contents[0]
1404
1405                     logger.debug("text[]='%s'", type(text))
1406                     if not isinstance(text, str):
1407                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1408                         continue
1409                     elif validators.domain(text.strip()):
1410                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1411                         continue
1412
1413                     text = tidyup.domain(text.strip())
1414                     logger.debug("text='%s' - AFTER!", text)
1415                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1416                         logger.debug("Found header: '%s'=%d", text, cnt)
1417                         block_headers[cnt] = text
1418
1419             elif len(block_headers) == 0:
1420                 logger.debug("row is not scrapable - SKIPPED!")
1421                 continue
1422             elif len(block_headers) > 0:
1423                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1424                 cnt = 0
1425                 block = dict()
1426
1427                 for element in row.find_all(["th", "td"]):
1428                     cnt = cnt + 1
1429                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1430                     if cnt in block_headers:
1431                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1432
1433                         text = element.text.strip()
1434                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1435
1436                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1437                         if key == "blocked":
1438                             block[key] = text
1439                         elif key == "block reason(s)":
1440                             block[key] = tidyup.reason(text)
1441                         elif key == "subdomain(s)":
1442                             block[key] = list()
1443                             if text != "":
1444                                 block[key] = text.split("/")
1445                         else:
1446                             logger.debug("key='%s'", key)
1447                             block[key] = text
1448
1449                 logger.debug("block()=%d ...", len(block))
1450                 if len(block) > 0:
1451                     logger.debug("Appending block()=%d ...", len(block))
1452                     blocklist.append(block)
1453
1454     logger.debug("blocklist()=%d", len(blocklist))
1455
1456     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1457     domains = database.cursor.fetchall()
1458
1459     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1460     blocking = list()
1461     for block in blocklist:
1462         logger.debug("block='%s'", block)
1463         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1464             origin = block["blocked"]
1465             logger.debug("origin='%s'", origin)
1466             for subdomain in block["subdomain(s)"]:
1467                 # Append a copy per subdomain; mutating and re-appending the
1468                 # shared dict would leave every entry at the last subdomain.
1469                 blocking.append(dict(block, blocked=subdomain + "." + origin))
1470         else:
1471             blocking.append(block)
1472
1473     logger.debug("blocking()=%d", len(blocking))
1474     for block in blocking:
1475         logger.debug("block[]='%s'", type(block))
1476         if "blocked" not in block:
1477             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1478
1479         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1480         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1481
1482         if block["blocked"] == "":
1483             logger.debug("block[blocked] is empty - SKIPPED!")
1484             continue
1485         elif not utils.is_domain_wanted(block["blocked"]):
1486             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1487             continue
1488         elif instances.is_recent(block["blocked"]):
1489             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1490             continue
1491
1492         logger.debug("Processing blocked='%s' ...", block["blocked"])
1493         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1494
1495     blockdict = list()
1496     for blocker in domains:
1497         blocker = blocker[0]
1498         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1499
1500         for block in blocking:
1501             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1502             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1503
1504             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1505             if block["blocked"] == "":
1506                 logger.debug("block[blocked] is empty - SKIPPED!")
1507                 continue
1508             elif not utils.is_domain_wanted(block["blocked"]):
1509                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1510                 continue
1511
1512             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1513             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1514                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1515                 blockdict.append({
1516                     "blocked": block["blocked"],
1517                     "reason" : block["reason"],
1518                 })
1519
1520         if instances.has_pending(blocker):
1521             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1522             instances.update_data(blocker)
1523
1524         logger.debug("Invoking commit() ...")
1525         database.connection.commit()
1526
1527         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1528         if config.get("bot_enabled") and len(blockdict) > 0:
1529             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1530             network.send_bot_post(blocker, blockdict)
1531
1532     logger.debug("Success! - EXIT!")
1533     return 0
1534
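# Sketch of a FediBlock wikitable the parser above maps (illustrative markup;
# the header row selects the columns, subdomains are split on "/"):
#
#     <tr><th>instance</th><th>subdomain(s)</th><th>block reason(s)</th></tr>
#     <tr><td>bad.example</td><td>mail/cdn</td><td>spam</td></tr>
#
# yields {"blocked": "bad.example", "subdomain(s)": ["mail", "cdn"], ...} and
# is later expanded to mail.bad.example and cdn.bad.example.
#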
1535 def recheck_obfuscation(args: argparse.Namespace) -> int:
1536     logger.debug("args[]='%s' - CALLED!", type(args))
1537
1538     logger.debug("Invoking locking.acquire() ...")
1539     locking.acquire()
1540
1541     if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1542         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1543     elif isinstance(args.software, str) and args.software != "":
1544         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1545     else:
1546         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1547
1548     rows = database.cursor.fetchall()
1549     logger.info("Checking %d domains ...", len(rows))
1550     for row in rows:
1551         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1552         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1553             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1554             continue
1555
1556         blocking = list()
1557         if row["software"] == "pleroma":
1558             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1559             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1560         elif row["software"] == "mastodon":
1561             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1562             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1563         elif row["software"] == "lemmy":
1564             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1565             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1566         elif row["software"] == "friendica":
1567             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1568             blocking = friendica.fetch_blocks(row["domain"])
1569         elif row["software"] == "misskey":
1570             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1571             blocking = misskey.fetch_blocks(row["domain"])
1572         else:
1573             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1574
1575         logger.debug("row[domain]='%s'", row["domain"])
1576         # chaos.social requires special care ...
1577         if row["domain"] != "chaos.social":
1578             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1579             instances.set_total_blocks(row["domain"], blocking)
1580
1581         obfuscated = 0
1582         blockdict = list()
1583
1584         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1585         for block in blocking:
1586             logger.debug("block[blocked]='%s'", block["blocked"])
1587             blocked = None
1588
1589             if block["blocked"] == "":
1590                 logger.debug("block[blocked] is empty - SKIPPED!")
1591                 continue
1592             elif block["blocked"].endswith(".arpa"):
1593                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1594                 continue
1595             elif block["blocked"].endswith(".tld"):
1596                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1597                 continue
1598             elif block["blocked"].endswith(".onion"):
1599                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1600                 continue
1601             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1602                 logger.debug("blocked='%s' is obfuscated.", block["blocked"])
1603                 obfuscated = obfuscated + 1
1604                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1605             elif not utils.is_domain_wanted(block["blocked"]):
1606                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1607                 continue
1608             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1609                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1610                 continue
1611
1612             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1613             if blocked is not None and blocked != block["blocked"]:
1614                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1615                 obfuscated = obfuscated - 1
1616                 if blocks.is_instance_blocked(row["domain"], blocked):
1617                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1618                     continue
1619
1620                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1621
1622                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1623                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1624                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1625                     blockdict.append({
1626                         "blocked": blocked,
1627                         "reason" : block["reason"],
1628                     })
1629
1630         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1631         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1632
1633         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1634         if obfuscated == 0 and len(blocking) > 0:
1635             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1636             instances.set_has_obfuscation(row["domain"], False)
1637
1638         if instances.has_pending(row["domain"]):
1639             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1640             instances.update_data(row["domain"])
1641
1642         logger.debug("Invoking commit() ...")
1643         database.connection.commit()
1644
1645         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1646         if config.get("bot_enabled") and len(blockdict) > 0:
1647             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1648             network.send_bot_post(row["domain"], blockdict)
1649
1650     logger.debug("Success! - EXIT!")
1651     return 0
1652
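# Illustrative obfuscation case for the loop above: a blocker may publish a
# wildcard or hash instead of the plain domain; utils.deobfuscate() then tries
# to match it against already-known instances (values here are hypothetical):
#
#     blocked = utils.deobfuscate("examp*e.com", "blocker.example", None)
#     # expected to resolve to "example.com" when a known instance matches
#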
1653 def fetch_fedilist(args: argparse.Namespace) -> int:
1654     logger.debug("args[]='%s' - CALLED!", type(args))
1655
1656     logger.debug("Invoking locking.acquire() ...")
1657     locking.acquire()
1658
1659     source_domain = "demo.fedilist.com"
1660     if sources.is_recent(source_domain):
1661         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1662         return 0
1663     else:
1664         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1665         sources.update(source_domain)
1666
1667     url = f"http://{source_domain}/instance/csv?onion=not"
1668     if args.software is not None and args.software != "":
1669         logger.debug("args.software='%s'", args.software)
1670         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1671
1672     logger.info("Fetching url='%s' ...", url)
1673     response = reqto.get(
1674         url,
1675         headers=network.web_headers,
1676         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1677         allow_redirects=False
1678     )
1679
1680     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1681     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1682         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1683         return 1
1684
1685     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1686
1687     logger.debug("reader[]='%s'", type(reader))
1688     if reader is None:
1689         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1690         return 2
1691
1692     rows = list(reader)
1693
1694     logger.info("Checking %d rows ...", len(rows))
1695     for row in rows:
1696         logger.debug("row[]='%s'", type(row))
1697         if "hostname" not in row:
1698             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1699             continue
1700
1701         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1702         domain = tidyup.domain(row["hostname"])
1703         logger.debug("domain='%s' - AFTER!", domain)
1704
1705         if domain == "":
1706             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1707             continue
1708
1709         logger.debug("domain='%s' - BEFORE!", domain)
1710         domain = domain.encode("idna").decode("utf-8")
1711         logger.debug("domain='%s' - AFTER!", domain)
1712
1713         if not utils.is_domain_wanted(domain):
1714             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1715             continue
1716         elif (args.force is None or not args.force) and instances.is_registered(domain):
1717             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1718             continue
1719         elif instances.is_recent(domain):
1720             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1721             continue
1722
1723         logger.info("Fetching instances from domain='%s' ...", domain)
1724         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1725
1726     logger.debug("Success! - EXIT!")
1727     return 0
1728
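# The fedilist CSV parsed above is read with csv.DictReader, so the first line
# must be a header carrying at least "hostname", e.g. (illustrative):
#
#     hostname,software,users
#     pleroma.example,pleroma,42
#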
1729 def update_nodeinfo(args: argparse.Namespace) -> int:
1730     logger.debug("args[]='%s' - CALLED!", type(args))
1731
1732     logger.debug("Invoking locking.acquire() ...")
1733     locking.acquire()
1734
1735     if args.domain is not None and args.domain != "":
1736         logger.debug("Fetching args.domain='%s'", args.domain)
1737         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1738     elif args.software is not None and args.software != "":
1739         logger.info("Fetching domains for args.software='%s'", args.software)
1740         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1741     else:
1742         logger.info("Fetching domains for recently updated ...")
1743         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1744
1745     domains = database.cursor.fetchall()
1746
1747     logger.info("Checking %d domain(s) ...", len(domains))
1748     cnt = 0
1749     for row in domains:
1750         logger.debug("row[]='%s'", type(row))
1751         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1752             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1753             continue
1754
1755         try:
1756             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1757             software = federation.determine_software(row["domain"])
1758
1759             logger.debug("Determined software='%s'", software)
1760             if (software != row["software"] and software is not None) or args.force is True:
1761                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1762                 instances.set_software(row["domain"], software)
1763
1764             instances.set_success(row["domain"])
1765         except network.exceptions as exception:
1766             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1767             instances.set_last_error(row["domain"], exception)
1768
1769         instances.set_last_nodeinfo(row["domain"])
1770         instances.update_data(row["domain"])
1771         cnt = cnt + 1
1772
1773     logger.debug("Success! - EXIT!")
1774     return 0
1775
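# Sketch: re-checking a single instance from a REPL (hypothetical values; the
# Namespace fields mirror the attributes read above):
#
#     args = argparse.Namespace(domain="example.social", software=None, force=False)
#     update_nodeinfo(args)
#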
1776 def fetch_instances_social(args: argparse.Namespace) -> int:
1777     logger.debug("args[]='%s' - CALLED!", type(args))
1778
1779     logger.debug("Invoking locking.acquire() ...")
1780     locking.acquire()
1781
1782     source_domain = "instances.social"
1783
1784     if config.get("instances_social_api_key") == "":
1785         logger.error("API key not set. Please set in your config.json file.")
1786         return 1
1787     elif sources.is_recent(source_domain):
1788         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1789         return 0
1790     else:
1791         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1792         sources.update(source_domain)
1793
1794     headers = {
1795         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1796     }
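    # The Bearer token above comes from config.json; a placeholder entry would
    # look like (hypothetical value):
    #     "instances_social_api_key": "YOUR-TOKEN-HERE"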
1797
1798     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1799     fetched = network.get_json_api(
1800         source_domain,
1801         "/api/1.0/instances/list?count=0&sort_by=name",
1802         headers,
1803         (config.get("connection_timeout"), config.get("read_timeout"))
1804     )
1805     logger.debug("fetched[]='%s'", type(fetched))
1806
1807     if "error_message" in fetched:
1808         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1809         return 2
1810     elif "exception" in fetched:
1811         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1812         return 3
1813     elif "json" not in fetched:
1814         logger.warning("fetched has no element 'json' - EXIT!")
1815         return 4
1816     elif "instances" not in fetched["json"]:
1817         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1818         return 5
1819
1820     domains = list()
1821     rows = fetched["json"]["instances"]
1822
1823     logger.info("Checking %d row(s) ...", len(rows))
1824     for row in rows:
1825         logger.debug("row[]='%s'", type(row))
1826         domain = tidyup.domain(row["name"])
1827         logger.debug("domain='%s' - AFTER!", domain)
1828
1829         if domain == "":
1830             logger.debug("domain is empty - SKIPPED!")
1831             continue
1832
1833         logger.debug("domain='%s' - BEFORE!", domain)
1834         domain = domain.encode("idna").decode("utf-8")
1835         logger.debug("domain='%s' - AFTER!", domain)
1836
1837         if not utils.is_domain_wanted(domain):
1838             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1839             continue
1840         elif domain in domains:
1841             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1842             continue
1843         elif instances.is_registered(domain):
1844             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1845             continue
1846         elif instances.is_recent(domain):
1847             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1848             continue
1849
1850         logger.info("Fetching instances from domain='%s'", domain)
1851         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1852
1853     logger.debug("Success! - EXIT!")
1854     return 0
1855
1856 def fetch_relays(args: argparse.Namespace) -> int:
1857     logger.debug("args[]='%s' - CALLED!", type(args))
1858
1859     if args.domain is not None and args.domain != "":
1860         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1861     else:
1862         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1863
1864     domains = list()
1865     rows = database.cursor.fetchall()
1866
1867     logger.info("Checking %d relays ...", len(rows))
1868     for row in rows:
1869         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1870         if not args.force and instances.is_recent(row["domain"]):
1871             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1872             continue
1873
1874         try:
1875             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1876             raw = utils.fetch_url(
1877                 f"https://{row['domain']}",
1878                 network.web_headers,
1879                 (config.get("connection_timeout"), config.get("read_timeout"))
1880             ).text
1881             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1882         except network.exceptions as exception:
1883             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1884             instances.set_last_error(row["domain"], exception)
1885             instances.set_last_instance_fetch(row["domain"])
1886             instances.update_data(row["domain"])
1887             continue
1888
1889         doc = bs4.BeautifulSoup(raw, features="html.parser")
1890         logger.debug("doc[]='%s'", type(doc))
1891
1892         logger.debug("row[software]='%s'", row["software"])
1893         if row["software"] == "activityrelay":
1894             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1895             tags = doc.findAll("p")
1896
1897             logger.debug("Checking %d paragraphs ...", len(tags))
1898             for tag in tags:
1899                 logger.debug("tag[]='%s'", type(tag))
1900                 if len(tag.contents) == 0:
1901                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1902                     continue
1903                 elif "registered instances" not in tag.contents[0]:
1904                     logger.debug("Skipping paragraph, text not found.")
1905                     continue
1906
1907                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1908                 for domain in tag.contents:
1909                     logger.debug("domain[%s]='%s'", type(domain), domain)
1910                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1911                         continue
1912
1913                     domain = str(domain)
1914                     logger.debug("domain='%s' - BEFORE!", domain)
1915                     domain = tidyup.domain(domain)
1916                     logger.debug("domain='%s' - AFTER!", domain)
1917
1918                     if not utils.is_domain_wanted(domain):
1919                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1920                         continue
1921
1922                     if domain == "":
1923                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1924                         continue
1925                     elif instances.is_registered(domain):
1926                         logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1927                         continue
1928                     elif dict_helper.has_key(domains, "domain", domain):
1929                         logger.debug("domain='%s' already added", domain)
1930                         continue
1931
1932                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1933                     domains.append({
1934                         "domain": domain,
1935                         "origin": row["domain"],
1936                     })
1937         elif row["software"] in ["aoderelay", "selective-relay"]:
1938             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1939             if row["software"] == "aoderelay":
1940                 tags = doc.findAll("section", {"class": "instance"})
1941             else:
1942                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1943
1944             logger.debug("Checking %d tags ...", len(tags))
1945             for tag in tags:
1946                 logger.debug("tag[]='%s'", type(tag))
1947
1948                 link = tag.find("a")
1949                 logger.debug("link[%s]='%s'", type(link), link)
1950                 if link is None:
1951                     logger.warning("tag='%s' has no a-tag ...", tag)
1952                     continue
1953
1954                 components = urlparse(link["href"])
1955                 domain = components.netloc.lower()
1956
1957                 logger.debug("domain='%s' - BEFORE!", domain)
1958                 domain = tidyup.domain(domain)
1959                 logger.debug("domain='%s' - AFTER!", domain)
1960
1961                 if not utils.is_domain_wanted(domain):
1962                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1963                     continue
1964
1965                 if domain == "":
1966                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1967                     continue
1968                 elif instances.is_registered(domain):
1969                     logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1970                     continue
1971                 elif dict_helper.has_key(domains, "domain", domain):
1972                     logger.debug("domain='%s' already added", domain)
1973                     continue
1974
1975                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1976                 domains.append({
1977                     "domain": domain,
1978                     "origin": row["domain"],
1979                 })
1980         else:
1981             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1982
1983         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1984         instances.set_last_instance_fetch(row["domain"])
1985         instances.update_data(row["domain"])
1986
1987     logger.info("Found %d domains to add ...", len(domains))
1988     for row in domains:
1989         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1990         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1991
1992     logger.debug("Success! - EXIT!")
1993     return 0
1994
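# Markup shapes the relay parser above expects, per software (illustrative
# snippets, inferred from the scraping code rather than documented formats):
#
#     activityrelay:   <p>... registered instances: peer1.example peer2.example</p>
#     aoderelay:       <section class="instance"><a href="https://peer.example">...</a></section>
#     selective-relay: <div id="instances"><li><a href="https://peer.example">...</a></li></div>
#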
1995 def convert_idna(args: argparse.Namespace) -> int:
1996     logger.debug("args[]='%s' - CALLED!", type(args))
1997
1998     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1999     rows = database.cursor.fetchall()
2000
2001     logger.debug("rows[]='%s'", type(rows))
2002     instances.translate_idnas(rows, "domain")
2003
2004     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2005     rows = database.cursor.fetchall()
2006
2007     logger.debug("rows[]='%s'", type(rows))
2008     instances.translate_idnas(rows, "origin")
2009
2010     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2011     rows = database.cursor.fetchall()
2012
2013     logger.debug("rows[]='%s'", type(rows))
2014     blocks.translate_idnas(rows, "blocker")
2015
2016     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2017     rows = database.cursor.fetchall()
2018
2019     logger.debug("rows[]='%s'", type(rows))
2020     blocks.translate_idnas(rows, "blocked")
2021
2022     logger.debug("Success! - EXIT!")
2023     return 0
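# Illustrative IDNA translation performed on the rows selected above (standard
# library behaviour):
#
#     "bücher.example".encode("idna").decode("utf-8")  # -> "xn--bcher-kva.example"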