# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

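# Each command below is dispatched from the command line and returns an
# integer exit status: 0 on success, values >= 100 for specific failures.
# The exact meaning of each code varies per command (see the return
# statements); this summary is inferred from the code, not a documented
# contract.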
def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

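# fetch_pixelfed_api() queries pixelfed.org's public server directory and
# registers newly discovered domains. A sketch of the expected response
# shape, inferred from the parsing below (only the fields accessed here are
# listed):
#
#   {"data": [{"domain": "pixelfed.example"}, ...]}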
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is used by default; network.source_headers does not need
    # to be added manually here.
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

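# fetch_bkali() posts a GraphQL query to gql.api.bka.li. Per the parsing
# below, the expected response shape is:
#
#   {"data": {"nodeinfo": [{"domain": "example.com"}, ...]}}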
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows[data]()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

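# fetch_blocks() re-fetches block lists from known instances. The set of
# instances checked depends on the arguments: a single --domain, all
# instances of one --software, everything with --force, or otherwise only
# instances whose last_blocked timestamp is older than the configured
# "recheck_block" interval.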
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if nodeinfo_url is None:
            logger.debug("blocker='%s',software='%s' has no nodeinfo_url set - SKIPPED!", blocker, software)
            continue
        elif not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()

        # chaos.social isn't part of oliphant's "hidden" blocklists
        logger.debug("blocker='%s'", blocker)
        if blocker != "chaos.social" and not blocklists.is_excluded(blocker):
            logger.debug("blocker='%s',software='%s'", blocker, software)
            if software == "pleroma":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)
        else:
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some servers obscure domains with question marks; it is
                # unclear whether this depends on the software version.
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

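# fetch_observer() scrapes fediverse.observer: first the software names from
# the navigation dropdown (unless --software is given), then the per-software
# table data from /app/views/tabledata.php, registering each listed domain.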
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - BEFORE!", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

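# fetch_todon_wiki() parses the silenced/limited and suspended server lists
# from wiki.todon.eu and records them as blocks by todon.eu, using the
# "silenced" and "reject" block levels respectively.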
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

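# fetch_cs() downloads chaos.social's federation.md, renders the Markdown to
# HTML, and records the "Silenced instances" and "Blocked instances" tables
# as blocks by chaos.social.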
def fetch_cs(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

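# fetch_fba_rss() reads an FBA-specific RSS feed given via --feed and
# registers every domain found in the item links. Presumably invoked as
# something like `./fba.py fetch_fba_rss --feed=<url>` (the exact invocation
# is an assumption, not taken from this file).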
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

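# fetch_fbabot_atom() reads the ATOM feed of the FBA bot account (default:
# https://ryona.agency/users/fba/feed.atom, overridable via --feed) and
# registers every domain linked from the entries' HTML content.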
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

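# fetch_instances() crawls the instance given via --domain and, unless
# --single is set, continues with known instances whose last_instance_fetch
# is older than the configured "recheck_instance" interval.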
def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

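# fetch_oliphant() imports the oliphant blocklists (CSV files) from
# codeberg.org. A sketch of the CSV columns handled below; column names with
# and without a leading '#' are both accepted, and the example row is
# illustrative only:
#
#   #domain,#severity,#reject_media,#reject_reports
#   example.com,suspend,true,false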
def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to the blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt = cnt + 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.domain(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.is_excluded(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

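# fetch_txt() imports plain-text blocklists, one domain per line. Only
# seirdy.one's bsl.txt is configured in the static `urls` tuple below.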
1131 def fetch_txt(args: argparse.Namespace) -> int:
1132     logger.debug("args[]='%s' - CALLED!", type(args))
1133
1134     logger.debug("Invoking locking.acquire() ...")
1135     locking.acquire()
1136
1137     # Static URLs
1138     urls = ({
1139         "blocker": "seirdy.one",
1140         "url"    : "https://seirdy.one/pb/bsl.txt",
1141     },)
1142
1143     logger.info("Checking %d text file(s) ...", len(urls))
1144     for row in urls:
1145         logger.debug("Fetching row[url]='%s' ...", row["url"])
1146         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1147
1148         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1149         if response.ok and response.status_code < 300 and response.text != "":
1150             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1151             domains = response.text.split("\n")
1152
1153             logger.info("Processing %d domains ...", len(domains))
1154             for domain in domains:
1155                 logger.debug("domain='%s' - BEFORE!", domain)
1156                 domain = tidyup.domain(domain)
1157
1158                 logger.debug("domain='%s' - AFTER!", domain)
1159                 if domain == "":
1160                     logger.debug("domain is empty - SKIPPED!")
1161                     continue
1162                 elif not domain_helper.is_wanted(domain):
1163                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1164                     continue
1165                 elif instances.is_recent(domain):
1166                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1167                     continue
1168
1169                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1170                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1171
1172                 logger.debug("processed='%s'", processed)
1173                 if not processed:
1174                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1175                     continue
1176
1177     logger.debug("Success! - EXIT!")
1178     return 0
1179
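fetch_txt() consumes a plain newline-delimited blocklist. The same fetch/split/tidy flow as a standalone sketch using requests directly; tidy() is a crude stand-in for tidyup.domain():

    import requests

    URL = "https://seirdy.one/pb/bsl.txt"

    def tidy(domain: str) -> str:
        # rough stand-in for tidyup.domain(): trim whitespace and trailing dots
        return domain.strip().strip(".").lower()

    response = requests.get(URL, timeout=(5, 30))
    if response.ok and response.text != "":
        domains = [tidy(line) for line in response.text.split("\n")]
        domains = [d for d in domains if d != ""]
        print(f"{len(domains)} domain(s) to process")
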
1180 def fetch_fedipact(args: argparse.Namespace) -> int:
1181     logger.debug("args[]='%s' - CALLED!", type(args))
1182
1183     logger.debug("Invoking locking.acquire() ...")
1184     locking.acquire()
1185
1186     source_domain = "fedipact.online"
1187     if sources.is_recent(source_domain):
1188         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1189         return 0
1190     else:
1191         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1192         sources.update(source_domain)
1193
1194     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1195     response = utils.fetch_url(
1196         f"https://{source_domain}",
1197         network.web_headers,
1198         (config.get("connection_timeout"), config.get("read_timeout"))
1199     )
1200
1201     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1202     if response.ok and response.status_code < 300 and response.text != "":
1203         logger.debug("Parsing %d Bytes ...", len(response.text))
1204
1205         doc = bs4.BeautifulSoup(response.text, "html.parser")
1206         logger.debug("doc[]='%s'", type(doc))
1207
1208         rows = doc.findAll("li")
1209         logger.info("Checking %d row(s) ...", len(rows))
1210         for row in rows:
1211             logger.debug("row[]='%s'", type(row))
1212             domain = tidyup.domain(row.contents[0])
1213
1214             logger.debug("domain='%s' - AFTER!", domain)
1215             if domain == "":
1216                 logger.debug("domain is empty - SKIPPED!")
1217                 continue
1218
1219             logger.debug("domain='%s' - BEFORE!", domain)
1220             domain = domain.encode("idna").decode("utf-8")
1221             logger.debug("domain='%s' - AFTER!", domain)
1222
1223             if not domain_helper.is_wanted(domain):
1224                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1225                 continue
1226             elif instances.is_registered(domain):
1227                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1228                 continue
1229             elif instances.is_recent(domain):
1230                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1231                 continue
1232
1233             logger.info("Fetching domain='%s' ...", domain)
1234             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1235
1236     logger.debug("Success! - EXIT!")
1237     return 0
1238
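fetch_fedipact() treats every <li> on the pledge page as one instance name and IDNA-encodes it, so Unicode hostnames are stored in punycode form. A minimal sketch of that extraction step over a made-up HTML snippet:

    import bs4

    html = "<ul><li>example.social</li><li>bücher.example</li></ul>"  # made-up sample
    doc = bs4.BeautifulSoup(html, "html.parser")

    for row in doc.find_all("li"):
        domain = row.contents[0].strip().lower()
        if domain == "":
            continue
        domain = domain.encode("idna").decode("utf-8")  # Unicode -> punycode
        print(domain)
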
1239 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
1240     logger.debug("args[]='%s' - CALLED!", type(args))
1241
1242     logger.debug("Invoking locking.acquire() ...")
1243     locking.acquire()
1244
1245     source_domain = "instances.joinmobilizon.org"
1246     if sources.is_recent(source_domain):
1247         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1248         return 0
1249     else:
1250         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1251         sources.update(source_domain)
1252
1253     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1254     raw = utils.fetch_url(
1255         f"https://{source_domain}/api/v1/instances",
1256         network.web_headers,
1257         (config.get("connection_timeout"), config.get("read_timeout"))
1258     ).text
1259     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1260
1261     parsed = json.loads(raw)
1262     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1263
1264     if "data" not in parsed:
1265         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1266         return 1
1267
1268     logger.info("Checking %d instances ...", len(parsed["data"]))
1269     for row in parsed["data"]:
1270         logger.debug("row[]='%s'", type(row))
1271         if "host" not in row:
1272             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1273             continue
1274         elif not domain_helper.is_wanted(row["host"]):
1275             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1276             continue
1277         elif instances.is_registered(row["host"]):
1278             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1279             continue
1280
1281         logger.info("Fetching row[host]='%s' ...", row["host"])
1282         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1283
1284     logger.debug("Success! - EXIT!")
1285     return 0
1286
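The Mobilizon index returns JSON with its instance records under "data", each carrying a "host" key, which is what the loop above iterates. The same walk as a standalone sketch:

    import requests

    url = "https://instances.joinmobilizon.org/api/v1/instances"
    parsed = requests.get(url, timeout=(5, 30)).json()

    if "data" not in parsed:
        raise KeyError("response does not contain key 'data'")

    hosts = [row["host"] for row in parsed["data"] if "host" in row]
    print(f"{len(hosts)} Mobilizon host(s) found")
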
1287 def fetch_joinmisskey(args: argparse.Namespace) -> int:
1288     logger.debug("args[]='%s' - CALLED!", type(args))
1289
1290     logger.debug("Invoking locking.acquire() ...")
1291     locking.acquire()
1292
1293     source_domain = "instanceapp.misskey.page"
1294     if sources.is_recent(source_domain):
1295         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1296         return 0
1297     else:
1298         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1299         sources.update(source_domain)
1300
1301     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1302     raw = utils.fetch_url(
1303         f"https://{source_domain}/instances.json",
1304         network.web_headers,
1305         (config.get("connection_timeout"), config.get("read_timeout"))
1306     ).text
1307     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1308
1309     parsed = json.loads(raw)
1310     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1311
1312     if "instancesInfos" not in parsed:
1313         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1314         return 1
1315
1316     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1317     for row in parsed["instancesInfos"]:
1318         logger.debug("row[%s]='%s'", type(row), row)
1319         if "url" not in row:
1320             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1321             continue
1322         elif not domain_helper.is_wanted(row["url"]):
1323             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1324             continue
1325         elif instances.is_registered(row["url"]):
1326             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1327             continue
1328
1329         logger.info("Fetching row[url]='%s' ...", row["url"])
1330         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1331
1332     logger.debug("Success! - EXIT!")
1333     return 0
1334
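Likewise, instances.json from instanceapp.misskey.page nests its rows under "instancesInfos", with the hostname in "url". Standalone sketch:

    import requests

    parsed = requests.get(
        "https://instanceapp.misskey.page/instances.json", timeout=(5, 30)
    ).json()

    for row in parsed.get("instancesInfos", []):
        if "url" not in row:
            continue
        print(row["url"])  # hostname of one Misskey instance
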
1335 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1336     logger.debug("args[]='%s' - CALLED!", type(args))
1337
1338     logger.debug("Invoking locking.acquire() ...")
1339     locking.acquire()
1340
1341     source_domain = "joinfediverse.wiki"
1342     if sources.is_recent(source_domain):
1343         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1344         return 0
1345     else:
1346         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1347         sources.update(source_domain)
1348
1349     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1350     raw = utils.fetch_url(
1351         f"https://{source_domain}/FediBlock",
1352         network.web_headers,
1353         (config.get("connection_timeout"), config.get("read_timeout"))
1354     ).text
1355     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1356
1357     doc = bs4.BeautifulSoup(raw, "html.parser")
1358     logger.debug("doc[]='%s'", type(doc))
1359
1360     tables = doc.findAll("table", {"class": "wikitable"})
1361
1362     logger.info("Analyzing %d table(s) ...", len(tables))
1363     blocklist = list()
1364     for table in tables:
1365         logger.debug("table[]='%s'", type(table))
1366
1367         rows = table.findAll("tr")
1368         logger.info("Checking %d row(s) ...", len(rows))
1369         block_headers = dict()
1370         for row in rows:
1371             logger.debug("row[%s]='%s'", type(row), row)
1372
1373             headers = row.findAll("th")
1374             logger.debug("Found headers()=%d header(s)", len(headers))
1375             if len(headers) > 1:
1376                 block_headers = dict()
1377                 cnt = 0
1378                 for header in headers:
1379                     cnt = cnt + 1
1380                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1381                     text = header.contents[0]
1382
1383                     logger.debug("text[]='%s'", type(text))
1384                     if not isinstance(text, str):
1385                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1386                         continue
1387                     elif validators.domain(text.strip()):
1388                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1389                         continue
1390
1391                     text = tidyup.domain(text.strip())
1392                     logger.debug("text='%s' - AFTER!", text)
1393                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1394                         logger.debug("Found header: '%s'=%d", text, cnt)
1395                         block_headers[cnt] = text
1396
1397             elif len(block_headers) == 0:
1398                 logger.debug("row is not scrapable - SKIPPED!")
1399                 continue
1400             elif len(block_headers) > 0:
1401                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1402                 cnt = 0
1403                 block = dict()
1404
1405                 for element in row.find_all(["th", "td"]):
1406                     cnt = cnt + 1
1407                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1408                     if cnt in block_headers:
1409                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1410
1411                         text = element.text.strip()
1412                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1413
1414                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1415                         if key in ["domain", "instance"]:
1416                             block[key] = text
1417                         elif key == "reason":
1418                             block[key] = tidyup.reason(text)
1419                         elif key == "subdomain(s)":
1420                             block[key] = list()
1421                             if text != "":
1422                                 block[key] = text.split("/")
1423                         else:
1424                             logger.debug("key='%s'", key)
1425                             block[key] = text
1426
1427                 logger.debug("block()=%d ...", len(block))
1428                 if len(block) > 0:
1429                     logger.debug("Appending block()=%d ...", len(block))
1430                     blocklist.append(block)
1431
1432     logger.debug("blocklist()=%d", len(blocklist))
1433
1434     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1435     domains = database.cursor.fetchall()
1436
1437     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1438     blocking = list()
1439     for block in blocklist:
1440         logger.debug("block='%s'", block)
1441         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1442             origin = block["blocked"]
1443             logger.debug("origin='%s'", origin)
1444             for subdomain in block["subdomain(s)"]:
1445                 entry = dict(block, blocked=subdomain + "." + origin)  # copy, so list entries don't alias one dict
1446                 logger.debug("entry[blocked]='%s'", entry["blocked"])
1447                 blocking.append(entry)
1448         else:
1449             blocking.append(block)
1450
1451     logger.debug("blocking()=%d", len(blocking))
1452     for block in blocking:
1453         logger.debug("block[]='%s'", type(block))
1454         if "blocked" not in block:
1455             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1456
1457         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1458         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1459
1460         if block["blocked"] == "":
1461             logger.debug("block[blocked] is empty - SKIPPED!")
1462             continue
1463         elif not domain_helper.is_wanted(block["blocked"]):
1464             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1465             continue
1466         elif instances.is_recent(block["blocked"]):
1467             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1468             continue
1469
1470         logger.debug("Processing blocked='%s' ...", block["blocked"])
1471         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1472
1473     blockdict = list()
1474     for blocker in domains:
1475         blocker = blocker[0]
1476         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1477         instances.set_last_blocked(blocker)
1478
1479         for block in blocking:
1480             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1481             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1482
1483             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1484             if block["blocked"] == "":
1485                 logger.debug("block[blocked] is empty - SKIPPED!")
1486                 continue
1487             elif not domain_helper.is_wanted(block["blocked"]):
1488                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1489                 continue
1490
1491             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1492             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1493                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1494                 blockdict.append({
1495                     "blocked": block["blocked"],
1496                     "reason" : block["reason"],
1497                 })
1498
1499         if instances.has_pending(blocker):
1500             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1501             instances.update_data(blocker)
1502
1503         logger.debug("Invoking commit() ...")
1504         database.connection.commit()
1505
1506         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1507         if config.get("bot_enabled") and len(blockdict) > 0:
1508             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1509             network.send_bot_post(blocker, blockdict)
1510
1511     logger.debug("Success! - EXIT!")
1512     return 0
1513
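The wiki scraper above first maps <th> cells to 1-based column positions, then reads each data row through that mapping, so the table's column order does not matter. A condensed sketch of the header-to-column technique over a made-up wikitable:

    import bs4

    html = """
    <table class="wikitable">
      <tr><th>instance</th><th>block reason(s)</th></tr>
      <tr><td>bad.example</td><td>spam</td></tr>
    </table>
    """  # made-up sample

    table = bs4.BeautifulSoup(html, "html.parser").find("table")
    rows = table.find_all("tr")

    headers = {}
    for cnt, th in enumerate(rows[0].find_all("th"), start=1):
        headers[cnt] = th.get_text(strip=True)  # e.g. {1: "instance", 2: "block reason(s)"}

    for row in rows[1:]:
        record = {}
        for cnt, cell in enumerate(row.find_all(["th", "td"]), start=1):
            if cnt in headers:
                record[headers[cnt]] = cell.get_text(strip=True)
        print(record)  # {'instance': 'bad.example', 'block reason(s)': 'spam'}
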
1514 def recheck_obfuscation(args: argparse.Namespace) -> int:
1515     logger.debug("args[]='%s' - CALLED!", type(args))
1516
1517     logger.debug("Invoking locking.acquire() ...")
1518     locking.acquire()
1519
1520     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1521         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1522     elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1523         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1524     else:
1525         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1526
1527     rows = database.cursor.fetchall()
1528     logger.info("Checking %d domains ...", len(rows))
1529     for row in rows:
1530         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1531         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1532             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1533             continue
1534
1535         blocking = list()
1536         if row["software"] == "pleroma":
1537             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1538             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1539         elif row["software"] == "mastodon":
1540             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1541             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1542         elif row["software"] == "lemmy":
1543             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1544             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1545         elif row["software"] == "friendica":
1546             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1547             blocking = friendica.fetch_blocks(row["domain"])
1548         elif row["software"] == "misskey":
1549             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1550             blocking = misskey.fetch_blocks(row["domain"])
1551         else:
1552             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1553
1554         # chaos.social (c.s) isn't part of oliphant's "hidden" blocklists
1555         logger.debug("row[domain]='%s'", row["domain"])
1556         if row["domain"] != "chaos.social" and not blocklists.is_excluded(row["domain"]):
1557             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1558             instances.set_last_blocked(row["domain"])
1559             instances.set_total_blocks(row["domain"], blocking)
1560
1561         obfuscated = 0
1562         blockdict = list()
1563
1564         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1565         for block in blocking:
1566             logger.debug("block[blocked]='%s'", block["blocked"])
1567             blocked = None
1568
1569             if block["blocked"] == "":
1570                 logger.debug("block[blocked] is empty - SKIPPED!")
1571                 continue
1572             elif block["blocked"].endswith(".arpa"):
1573                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1574                 continue
1575             elif block["blocked"].endswith(".tld"):
1576                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1577                 continue
1578             elif block["blocked"].endswith(".onion"):
1579                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1580                 continue
1581             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1582                 logger.debug("block='%s' is obfuscated.", block["blocked"])
1583                 obfuscated = obfuscated + 1
1584                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1585             elif not domain_helper.is_wanted(block["blocked"]):
1586                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1587                 continue
1588             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1589                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1590                 continue
1591
1592             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1593             if blocked is not None and blocked != block["blocked"]:
1594                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1595                 obfuscated = obfuscated - 1
1596
1597                 if blocks.is_instance_blocked(row["domain"], blocked):
1598                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1599                     continue
1600                 elif blacklist.is_blacklisted(blocked):
1601                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1602                     continue
1603
1604                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1605
1606                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1607                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1608                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1609                     blockdict.append({
1610                         "blocked": blocked,
1611                         "reason" : block["reason"],
1612                     })
1613
1614         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1615         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1616
1617         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1618         if obfuscated == 0 and len(blocking) > 0:
1619             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1620             instances.set_has_obfuscation(row["domain"], False)
1621
1622         if instances.has_pending(row["domain"]):
1623             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1624             instances.update_data(row["domain"])
1625
1626         logger.debug("Invoking commit() ...")
1627         database.connection.commit()
1628
1629         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1630         if config.get("bot_enabled") and len(blockdict) > 0:
1631             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1632             network.send_bot_post(row["domain"], blockdict)
1633
1634     logger.debug("Success! - EXIT!")
1635     return 0
1636
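recheck_obfuscation() counts entries containing "*" or "?" and asks utils.deobfuscate() to resolve them. For Mastodon-style block exports that publish a SHA-256 digest next to the censored name, one plausible resolution strategy (a sketch only, not necessarily what utils.deobfuscate() does) is to hash known peer domains and compare:

    import hashlib

    def deobfuscate_by_hash(digest: str, known_domains: list) -> str:
        """Return the known domain whose SHA-256 matches the block digest, else ''."""
        for candidate in known_domains:
            if hashlib.sha256(candidate.encode("utf-8")).hexdigest() == digest:
                return candidate
        return ""

    known = ["example.com", "bad.example"]
    digest = hashlib.sha256(b"bad.example").hexdigest()
    print(deobfuscate_by_hash(digest, known))  # -> bad.example
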
1637 def fetch_fedilist(args: argparse.Namespace) -> int:
1638     logger.debug("args[]='%s' - CALLED!", type(args))
1639
1640     logger.debug("Invoking locking.acquire() ...")
1641     locking.acquire()
1642
1643     source_domain = "demo.fedilist.com"
1644     if sources.is_recent(source_domain):
1645         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1646         return 0
1647     else:
1648         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1649         sources.update(source_domain)
1650
1651     url = f"http://{source_domain}/instance/csv?onion=not"
1652     if args.software is not None and args.software != "":
1653         logger.debug("args.software='%s'", args.software)
1654         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1655
1656     logger.info("Fetching url='%s' ...", url)
1657     response = reqto.get(
1658         url,
1659         headers=network.web_headers,
1660         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1661         allow_redirects=False
1662     )
1663
1664     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1665     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1666         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1667         return 1
1668
1669     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1670
1671     logger.debug("reader[]='%s'", type(reader))
1672     if reader is None:
1673         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1674         return 2
1675
1676     rows = list(reader)
1677
1678     logger.info("Checking %d rows ...", len(rows))
1679     for row in rows:
1680         logger.debug("row[]='%s'", type(row))
1681         if "hostname" not in row:
1682             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1683             continue
1684
1685         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1686         domain = tidyup.domain(row["hostname"])
1687         logger.debug("domain='%s' - AFTER!", domain)
1688
1689         if domain == "":
1690             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1691             continue
1692
1693         logger.debug("domain='%s' - BEFORE!", domain)
1694         domain = domain.encode("idna").decode("utf-8")
1695         logger.debug("domain='%s' - AFTER!", domain)
1696
1697         if not domain_helper.is_wanted(domain):
1698             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1699             continue
1700         elif (args.force is None or not args.force) and instances.is_registered(domain):
1701             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1702             continue
1703         elif instances.is_recent(domain):
1704             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1705             continue
1706
1707         logger.info("Fetching instances from domain='%s' ...", domain)
1708         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1709
1710     logger.debug("Success! - EXIT!")
1711     return 0
1712
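fedilist serves CSV, so the command decodes the body and feeds it through csv.DictReader, addressing each row by its "hostname" column. The same parsing step over an inline sample:

    import csv

    payload = "hostname,software\nexample.com,mastodon\n"  # inline sample row
    reader = csv.DictReader(payload.splitlines(), dialect="unix")

    for row in reader:
        if "hostname" not in row:
            continue
        print(row["hostname"], row["software"])
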
1713 def update_nodeinfo(args: argparse.Namespace) -> int:
1714     logger.debug("args[]='%s' - CALLED!", type(args))
1715
1716     logger.debug("Invoking locking.acquire() ...")
1717     locking.acquire()
1718
1719     if args.domain is not None and args.domain != "":
1720         logger.debug("Fetching args.domain='%s'", args.domain)
1721         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1722     elif args.software is not None and args.software != "":
1723         logger.info("Fetching domains for args.software='%s'", args.software)
1724         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1725     elif args.mode is not None and args.mode != "":
1726         logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1727         database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1728     elif args.no_software:
1729         logger.info("Fetching domains with no software type detected ...")
1730         database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
1731     else:
1732         logger.info("Fetching domains for recently updated ...")
1733         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1734
1735     domains = database.cursor.fetchall()
1736
1737     logger.info("Checking %d domain(s) ...", len(domains))
1738     cnt = 0
1739     for row in domains:
1740         logger.debug("row[]='%s'", type(row))
1741         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1742             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1743             continue
1744
1745         try:
1746             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1747             software = federation.determine_software(row["domain"])
1748
1749             logger.debug("Determined software='%s'", software)
1750             if (software != row["software"] and software is not None) or args.force is True:
1751                 logger.debug("software='%s'", software)
1752                 if software is None:
1753                     logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1754                     instances.set_nodeinfo_url(row["domain"], None)
1755
1756                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1757                 instances.set_software(row["domain"], software)
1758
1759             if software is not None:
1760                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1761                 instances.set_success(row["domain"])
1762         except network.exceptions as exception:
1763             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1764             instances.set_last_error(row["domain"], exception)
1765
1766         instances.set_last_nodeinfo(row["domain"])
1767         instances.update_data(row["domain"])
1768         cnt = cnt + 1
1769
1770     logger.debug("Success! - EXIT!")
1771     return 0
1772
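federation.determine_software() ultimately rests on the NodeInfo discovery protocol: fetch /.well-known/nodeinfo, follow one of the advertised "links" entries to the schema document, and read software.name there. A minimal sketch of that two-step lookup, assuming the instance serves well-formed NodeInfo:

    import requests

    def fetch_software(domain: str) -> str:
        """Resolve a domain's software name via its NodeInfo well-known document."""
        well_known = requests.get(
            f"https://{domain}/.well-known/nodeinfo", timeout=(5, 30)
        ).json()
        links = well_known.get("links", [])
        if not links:
            return ""
        nodeinfo = requests.get(links[-1]["href"], timeout=(5, 30)).json()
        return nodeinfo.get("software", {}).get("name", "")

    print(fetch_software("mastodon.social"))  # e.g. 'mastodon'
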
1773 def fetch_instances_social(args: argparse.Namespace) -> int:
1774     logger.debug("args[]='%s' - CALLED!", type(args))
1775
1776     logger.debug("Invoking locking.acquire() ...")
1777     locking.acquire()
1778
1779     source_domain = "instances.social"
1780
1781     if config.get("instances_social_api_key") == "":
1782         logger.error("API key not set. Please set in your config.json file.")
1783         return 1
1784     elif sources.is_recent(source_domain):
1785         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1786         return 0
1787     else:
1788         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1789         sources.update(source_domain)
1790
1791     headers = {
1792         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1793     }
1794
1795     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1796     fetched = network.get_json_api(
1797         source_domain,
1798         "/api/1.0/instances/list?count=0&sort_by=name",
1799         headers,
1800         (config.get("connection_timeout"), config.get("read_timeout"))
1801     )
1802     logger.debug("fetched[]='%s'", type(fetched))
1803
1804     if "error_message" in fetched:
1805         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1806         return 2
1807     elif "exception" in fetched:
1808         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1809         return 3
1810     elif "json" not in fetched:
1811         logger.warning("fetched has no element 'json' - EXIT!")
1812         return 4
1813     elif "instances" not in fetched["json"]:
1814         logger.warning("fetched['json'] has no element 'instances' - EXIT!")
1815         return 5
1816
1817     domains = list()
1818     rows = fetched["json"]["instances"]
1819
1820     logger.info("Checking %d row(s) ...", len(rows))
1821     for row in rows:
1822         logger.debug("row[]='%s'", type(row))
1823         domain = tidyup.domain(row["name"])
1824         logger.debug("domain='%s' - AFTER!", domain)
1825
1826         if domain == "":
1827             logger.debug("domain is empty - SKIPPED!")
1828             continue
1829
1830         logger.debug("domain='%s' - BEFORE!", domain)
1831         domain = domain.encode("idna").decode("utf-8")
1832         logger.debug("domain='%s' - AFTER!", domain)
1833
1834         if not domain_helper.is_wanted(domain):
1835             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1836             continue
1837         elif domain in domains:
1838             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1839             continue
1840         elif instances.is_registered(domain):
1841             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1842             continue
1843         elif instances.is_recent(domain):
1844             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1845             continue
1846
1847         logger.info("Fetching instances from domain='%s'", domain)
1848         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1849
1850     logger.debug("Success! - EXIT!")
1851     return 0
1852
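The instances.social API expects the key from config.json as a bearer token and returns the instance list under json["instances"]. An equivalent direct call with requests; the token value is a placeholder:

    import requests

    API_KEY = "YOUR-INSTANCES-SOCIAL-KEY"  # placeholder, taken from config.json in fba

    response = requests.get(
        "https://instances.social/api/1.0/instances/list?count=0&sort_by=name",
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=(5, 30),
    )
    data = response.json()
    names = [row["name"] for row in data.get("instances", [])]
    print(f"{len(names)} instance(s) listed")
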
1853 def fetch_relays(args: argparse.Namespace) -> int:
1854     logger.debug("args[]='%s' - CALLED!", type(args))
1855
1856     logger.debug("Invoking locking.acquire() ...")
1857     locking.acquire()
1858
1859     if args.domain is not None and args.domain != "":
1860         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1861     else:
1862         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1863
1864     domains = list()
1865     rows = database.cursor.fetchall()
1866
1867     logger.info("Checking %d relays ...", len(rows))
1868     for row in rows:
1869         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1870         peers = list()
1871         if not args.force and instances.is_recent(row["domain"]):
1872             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1873             continue
1874
1875         try:
1876             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1877             raw = utils.fetch_url(
1878                 f"https://{row['domain']}",
1879                 network.web_headers,
1880                 (config.get("connection_timeout"), config.get("read_timeout"))
1881             ).text
1882             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1883         except network.exceptions as exception:
1884             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1885             instances.set_last_error(row["domain"], exception)
1886             instances.set_last_instance_fetch(row["domain"])
1887             instances.update_data(row["domain"])
1888             continue
1889
1890         doc = bs4.BeautifulSoup(raw, features="html.parser")
1891         logger.debug("doc[]='%s'", type(doc))
1892
1893         logger.debug("row[software]='%s'", row["software"])
1894         if row["software"] == "activityrelay":
1895             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1896             tags = doc.findAll("p")
1897
1898             logger.debug("Checking %d paragraphs ...", len(tags))
1899             for tag in tags:
1900                 logger.debug("tag[]='%s'", type(tag))
1901                 if len(tag.contents) == 0:
1902                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1903                     continue
1904                 elif "registered instances" not in tag.contents[0]:
1905                     logger.debug("Skipping paragraph, text not found.")
1906                     continue
1907
1908                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1909                 for domain in tag.contents:
1910                     logger.debug("domain[%s]='%s'", type(domain), domain)
1911                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1912                         continue
1913
1914                     domain = str(domain)
1915                     logger.debug("domain='%s'", domain)
1916                     if not domain_helper.is_wanted(domain):
1917                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1918                         continue
1919
1920                     logger.debug("domain='%s' - BEFORE!", domain)
1921                     domain = tidyup.domain(domain)
1922                     logger.debug("domain='%s' - AFTER!", domain)
1923
1924                     if domain == "":
1925                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1926                         continue
1927                     elif domain not in peers:
1928                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1929                         peers.append(domain)
1930
1931                     if dict_helper.has_key(domains, "domain", domain):
1932                         logger.debug("domain='%s' already added", domain)
1933                         continue
1934
1935                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1936                     domains.append({
1937                         "domain": domain,
1938                         "origin": row["domain"],
1939                     })
1940         elif row["software"] in ["aoderelay", "selective-relay"]:
1941             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1942             if row["software"] == "aoderelay":
1943                 tags = doc.findAll("section", {"class": "instance"})
1944             else:
1945                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1946
1947             logger.debug("Checking %d tags ...", len(tags))
1948             for tag in tags:
1949                 logger.debug("tag[]='%s'", type(tag))
1950
1951                 link = tag.find("a")
1952                 logger.debug("link[%s]='%s'", type(link), link)
1953                 if link is None:
1954                     logger.warning("tag='%s' has no a-tag ...", tag)
1955                     continue
1956
1957                 components = urlparse(link["href"])
1958                 domain = components.netloc.lower()
1959
1960                 if not domain_helper.is_wanted(domain):
1961                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1962                     continue
1963
1964                 logger.debug("domain='%s' - BEFORE!", domain)
1965                 domain = tidyup.domain(domain)
1966                 logger.debug("domain='%s' - AFTER!", domain)
1967
1968                 if domain == "":
1969                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1970                     continue
1971                 elif domain not in peers:
1972                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1973                     peers.append(domain)
1974
1975                 if dict_helper.has_key(domains, "domain", domain):
1976                     logger.debug("domain='%s' already added", domain)
1977                     continue
1978
1979                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1980                 domains.append({
1981                     "domain": domain,
1982                     "origin": row["domain"],
1983                 })
1984         else:
1985             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1986
1987         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1988         instances.set_last_instance_fetch(row["domain"])
1989
1990         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1991         instances.set_total_peers(row["domain"], peers)
1992
1993         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1994         instances.update_data(row["domain"])
1995
1996     logger.info("Checking %d domains ...", len(domains))
1997     for row in domains:
1998         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1999         if instances.is_registered(row["domain"]):
2000             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2001             continue
2002
2003         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2004         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2005
2006     logger.debug("Success! - EXIT!")
2007     return 0
2008
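For aoderelay and selective-relay, the peer list comes from anchor tags: each href is run through urlparse and only the lowercased netloc is kept, deduplicated. Condensed sketch over a made-up snippet:

    import bs4
    from urllib.parse import urlparse

    html = '<div id="instances"><ul><li><a href="https://Example.COM/actor">x</a></li></ul></div>'

    doc = bs4.BeautifulSoup(html, "html.parser")
    peers = []
    for tag in doc.find("div", {"id": "instances"}).find_all("li"):
        link = tag.find("a")
        if link is None:
            continue
        domain = urlparse(link["href"]).netloc.lower()
        if domain != "" and domain not in peers:
            peers.append(domain)

    print(peers)  # ['example.com']
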
2009 def convert_idna(args: argparse.Namespace) -> int:
2010     logger.debug("args[]='%s' - CALLED!", type(args))
2011
2012     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2013     rows = database.cursor.fetchall()
2014
2015     logger.debug("rows[]='%s'", type(rows))
2016     instances.translate_idnas(rows, "domain")
2017
2018     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2019     rows = database.cursor.fetchall()
2020
2021     logger.debug("rows[]='%s'", type(rows))
2022     instances.translate_idnas(rows, "origin")
2023
2024     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2025     rows = database.cursor.fetchall()
2026
2027     logger.debug("rows[]='%s'", type(rows))
2028     blocks.translate_idnas(rows, "blocker")
2029
2030     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2031     rows = database.cursor.fetchall()
2032
2033     logger.debug("rows[]='%s'", type(rows))
2034     blocks.translate_idnas(rows, "blocked")
2035
2036     logger.debug("Success! - EXIT!")
2037     return 0
2038
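convert_idna() rewrites stored Unicode hostnames into their punycode ("xn--") form via Python's built-in idna codec. A sketch of the conversion, including the UnicodeError guard worth having for malformed labels:

    def to_punycode(domain: str) -> str:
        """Convert a Unicode hostname to its IDNA (xn--) representation."""
        try:
            return domain.encode("idna").decode("utf-8")
        except UnicodeError:
            return domain  # leave malformed labels untouched

    print(to_punycode("bücher.example"))  # xn--bcher-kva.example
    print(to_punycode("example.com"))     # unchanged
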
2039 def remove_invalid(args: argparse.Namespace) -> int:
2040     logger.debug("args[]='%s' - CALLED!", type(args))
2041
2042     logger.debug("Invoking locking.acquire() ...")
2043     locking.acquire()
2044
2045     database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2046     rows = database.cursor.fetchall()
2047
2048     logger.info("Checking %d domains ...", len(rows))
2049     for row in rows:
2050         logger.debug("row[domain]='%s'", row["domain"])
2051         if not validators.domain(row["domain"].split("/")[0]):
2052             logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2053             database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2054             database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2055
2056     logger.debug("Invoking commit() ...")
2057     database.connection.commit()
2058
2059     logger.info("Vacuum cleaning database ...")
2060     database.cursor.execute("VACUUM")
2061
2062     logger.debug("Success! - EXIT!")
2063     return 0
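
remove_invalid() re-validates only the part of each stored domain before any "/", deletes offenders from both tables, then VACUUMs to reclaim space. The same flow against a throwaway SQLite database:

    import sqlite3
    import validators

    connection = sqlite3.connect(":memory:")
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE instances (domain TEXT)")
    cursor.executemany("INSERT INTO instances VALUES (?)", [("example.com",), ("not a domain",)])

    cursor.execute("SELECT domain FROM instances")
    for (domain,) in cursor.fetchall():
        if not validators.domain(domain.split("/")[0]):
            cursor.execute("DELETE FROM instances WHERE domain = ?", [domain])

    connection.commit()
    cursor.execute("VACUUM")  # reclaims space on disk-backed databases
    print(cursor.execute("SELECT domain FROM instances").fetchall())  # [('example.com',)]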