# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import inspect
import json
import logging
import time

from urllib.parse import urlparse

import argparse
import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
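    """Checks whether a single domain is valid, not blacklisted and not yet
    registered. Returns 0 if the domain is still unknown, or a status code
    (100-102) describing why it cannot be added."""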
    logger.debug("args.domain='%s' - CALLED!", args.domain)
    status = 0
    if not validators.hostname(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

def check_nodeinfo(args: argparse.Namespace) -> int:
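    """Scans all registered instances that have a nodeinfo URL and counts
    those whose nodeinfo_url contains neither the stored domain nor its
    punycode form. Relative URLs always match and are skipped."""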
    logger.debug("args[]='%s' - CALLED!", type(args))

    # Fetch rows
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt += 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

def fetch_pixelfed_api(args: argparse.Namespace) -> int:
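    """Fetches the public server list from the pixelfed.org API and queues
    every new, wanted domain for instance fetching."""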
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF token is used by default; network.source_headers does not need to be added here
    headers = tuple()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
        return 100

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("JSON API returned %d elements", len(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 101
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 102

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] == "":
                logger.debug("row[domain] is empty - SKIPPED!")
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 103

    logger.debug("Success! - EXIT!")
    return 0

def fetch_bkali(args: argparse.Namespace) -> int:
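    """Fetches a domain list from the gql.api.bka.li GraphQL API and fetches
    instances from every new, wanted domain."""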
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error_message='%s'", source_domain, fetched["error_message"])
            return 100
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s'", fetched["json"]["error"]["message"])
            return 101

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows(data)()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 102

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success - EXIT!")
    return 0

def fetch_blocks(args: argparse.Namespace) -> int:
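    """Fetches block lists from registered instances, restricted to a single
    domain (--domain), a single software (--software), everything (--force)
    or all instances due for a re-check. Parsed blocks are tidied up,
    deobfuscated where possible and written to the blocks table."""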
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.hostname(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
        )
    elif args.force:
        # Re-check all
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if nodeinfo_url is None:
            logger.debug("blocker='%s',software='%s' has no nodeinfo_url set - SKIPPED!", blocker, software)
            continue
        elif not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        blocking = list()

        if blocker != "chaos.social" and not blocklists.is_excluded(blocker):
            logger.debug("blocker='%s',software='%s'", blocker, software)
            if software == "pleroma":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                logger.info("blocker='%s',software='%s'", blocker, software)
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)
        else:
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        blockdict = list()
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"]  = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)
                    continue

                block["blocked"] = row["domain"]
                origin           = row["origin"]
                nodeinfo_url     = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_observer(args: argparse.Namespace) -> int:
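    """Fetches instance lists per software type from fediverse.observer and
    queues every new, wanted domain for instance fetching."""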
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        doc = None
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                network.web_headers,
                (config.get("connection_timeout"), config.get("read_timeout"))
            ).text
            logger.debug("raw[%s]()=%d", type(raw), len(raw))

            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s'", domain)

            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue

            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_todon_wiki(args: argparse.Namespace) -> int:
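    """Fetches the silenced/limited and suspended server lists from
    wiki.todon.eu and records them as blocks for todon.eu."""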
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    blocklist = {
        "silenced": list(),
        "reject": list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "todon.eu"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    blockdict = list()
    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace) -> int:
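    """Fetches chaos.social's federation.md from raw.githubusercontent.com,
    parses the silenced/blocked tables and records them as blocks for
    chaos.social."""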
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    extensions = [
        "extra",
        "abbr",
        "attr_list",
        "def_list",
        "fenced_code",
        "footnotes",
        "md_in_html",
        "admonition",
        "codehilite",
        "legacy_attrs",
        "legacy_em",
        "meta",
        "nl2br",
        "sane_lists",
        "smarty",
        "toc",
        "wikilinks"
    ]

    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        blockdict = list()
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if "domain" not in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fba_rss(args: argparse.Namespace) -> int:
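    """Parses an FBA-specific RSS feed given via --feed and queues every new
    domain found in the item links."""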
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
        return 0
    else:
        logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
        sources.update(components.netloc)

    logger.info("Fetching FBA-specific RSS feed args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_fbabot_atom(args: argparse.Namespace) -> int:
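    """Parses the FBA bot's Atom feed (default: ryona.agency, overridable via
    --feed) and queues every new domain linked from the entries."""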
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain == "":
                        logger.debug("domain is empty - SKIPPED!")
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)
            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)
                return 100

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
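    """Fetches instances starting from a single domain (--domain) and, unless
    --single is given, continues with all known instances that are due for a
    re-check."""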
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("args.domain='%s' - checking ...", args.domain)
    if not validators.hostname(args.domain):
        logger.warning("args.domain='%s' is not valid.", args.domain)
        return 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        return 101

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Initialize values
    domain = tidyup.domain(args.domain)
    origin = software = None

    # Fetch record
    database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
    row = database.cursor.fetchone()
    if row is not None:
        origin = row["origin"]
        software = row["software"]

    # Initial fetch
    try:
        logger.info("Fetching instances from domain='%s',origin='%s',software='%s' ...", domain, origin, software)
        federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
        instances.set_last_error(args.domain, exception)
        instances.update_data(args.domain)
        return 100

    if args.single:
        logger.debug("Not fetching more instances - EXIT!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s'", row["domain"])
        if row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")
            continue

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
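    """Downloads the oliphant blocklist CSV files from codeberg.org and
    imports the listed blocks per blocker, optionally restricted to a single
    blocker via --domain."""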
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    # Base URL
    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    domains = list()

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is a domain given and not equal to this blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue
        elif args.domain in domains:
            logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
            continue

        instances.set_last_blocked(block["blocker"])

        # Fetch this URL
        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
        if not response.ok or response.status_code >= 300 or response.content == b"":
            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
            continue

        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

        blockdict = list()

        cnt = 0
        for row in reader:
            logger.debug("row[%s]='%s'", type(row), row)
            domain = severity = None
            reject_media = reject_reports = False

            if "#domain" in row:
                domain = row["#domain"]
            elif "domain" in row:
                domain = row["domain"]
            else:
                logger.debug("row='%s' does not contain domain column", row)
                continue

            if "#severity" in row:
                severity = blocks.alias_block_level(row["#severity"])
            elif "severity" in row:
                severity = blocks.alias_block_level(row["severity"])
            else:
                logger.debug("row='%s' does not contain severity column", row)
                continue

            if "#reject_media" in row and row["#reject_media"].lower() == "true":
                reject_media = True
            elif "reject_media" in row and row["reject_media"].lower() == "true":
                reject_media = True

            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
                reject_reports = True
            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
                reject_reports = True

            cnt += 1
            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
            if domain == "":
                logger.debug("domain is empty - SKIPPED!")
                continue
            elif domain.endswith(".onion"):
                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
                continue
            elif domain.endswith(".arpa"):
                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
                continue
            elif domain.endswith(".tld"):
                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
                continue
            elif domain.find("*") >= 0 or domain.find("?") >= 0:
                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
                domain = utils.deobfuscate(domain, block["blocker"])
                logger.debug("domain='%s' - AFTER!", domain)

            if not validators.hostname(domain):
                logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
                continue
            elif blacklist.is_blacklisted(domain):
                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
                continue
            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
                continue

            logger.debug("Marking domain='%s' as handled", domain)
            domains.append(domain)

            logger.debug("Processing domain='%s' ...", domain)
            processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
            logger.debug("processed='%s'", processed)

            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
                blockdict.append({
                    "blocked": domain,
                    "reason" : None,
                })

            if reject_media:
                processing.block(block["blocker"], domain, None, "reject_media")
            if reject_reports:
                processing.block(block["blocker"], domain, None, "reject_reports")

        logger.debug("block[blocker]='%s'", block["blocker"])
        if not blocklists.is_excluded(block["blocker"]):
            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
            instances.set_total_blocks(block["blocker"], domains)

        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
        if instances.has_pending(block["blocker"]):
            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
            instances.update_data(block["blocker"])

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
            network.send_bot_post(block["blocker"], blockdict)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
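    """Fetches static text blocklists (currently seirdy.one's bsl.txt) and
    processes every listed domain."""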
1130     logger.debug("args[]='%s' - CALLED!", type(args))
1131
1132     logger.debug("Invoking locking.acquire() ...")
1133     locking.acquire()
1134
1135     # Static URLs
1136     urls = ({
1137         "blocker": "seirdy.one",
1138         "url"    : "https://seirdy.one/pb/bsl.txt",
1139     },)
1140
1141     logger.info("Checking %d text file(s) ...", len(urls))
1142     for row in urls:
1143         logger.debug("Fetching row[url]='%s' ...", row["url"])
1144         response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1145
1146         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1147         if response.ok and response.status_code < 300 and response.text != "":
1148             logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1149             domains = response.text.split("\n")
1150
1151             logger.info("Processing %d domains ...", len(domains))
1152             for domain in domains:
1153                 logger.debug("domain='%s' - BEFORE!", domain)
1154                 domain = tidyup.domain(domain)
1155
1156                 logger.debug("domain='%s' - AFTER!", domain)
1157                 if domain == "":
1158                     logger.debug("domain is empty - SKIPPED!")
1159                     continue
1160                 elif not domain_helper.is_wanted(domain):
1161                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1162                     continue
1163                 elif instances.is_recent(domain):
1164                     logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1165                     continue
1166
1167                 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1168                 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1169
1170                 logger.debug("processed='%s'", processed)
1171                 if not processed:
1172                     logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1173                     continue
1174
1175     logger.debug("Success! - EXIT!")
1176     return 0
1177
1178 def fetch_fedipact(args: argparse.Namespace) -> int:
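    """
    Scrapes the fedipact.online landing page for instances listed as <li>
    elements and fetches instance data for every wanted domain that is not
    yet registered or recently crawled.
    """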
1179     logger.debug("args[]='%s' - CALLED!", type(args))
1180
1181     logger.debug("Invoking locking.acquire() ...")
1182     locking.acquire()
1183
1184     source_domain = "fedipact.online"
1185     if sources.is_recent(source_domain):
1186         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1187         return 0
1188     else:
1189         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1190         sources.update(source_domain)
1191
1192     logger.info("Fetching / from source_domain='%s' ...", source_domain)
1193     response = utils.fetch_url(
1194         f"https://{source_domain}",
1195         network.web_headers,
1196         (config.get("connection_timeout"), config.get("read_timeout"))
1197     )
1198
1199     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1200     if response.ok and response.status_code < 300 and response.text != "":
1201         logger.debug("Parsing %d Bytes ...", len(response.text))
1202
1203         doc = bs4.BeautifulSoup(response.text, "html.parser")
1204         logger.debug("doc[]='%s'", type(doc))
1205
1206         rows = doc.findAll("li")
1207         logger.info("Checking %d row(s) ...", len(rows))
1208         for row in rows:
1209             logger.debug("row[]='%s'", type(row))
1210             domain = tidyup.domain(row.contents[0])
1211
1212             logger.debug("domain='%s' - AFTER!", domain)
1213             if domain == "":
1214                 logger.debug("domain is empty - SKIPPED!")
1215                 continue
1216
1217             logger.debug("domain='%s' - BEFORE!", domain)
1218             domain = domain.encode("idna").decode("utf-8")
1219             logger.debug("domain='%s' - AFTER!", domain)
1220
1221             if not domain_helper.is_wanted(domain):
1222                 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1223                 continue
1224             elif instances.is_registered(domain):
1225                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1226                 continue
1227             elif instances.is_recent(domain):
1228                 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1229                 continue
1230
1231             logger.info("Fetching domain='%s' ...", domain)
1232             federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1233
1234     logger.debug("Success! - EXIT!")
1235     return 0
1236
1237 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
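    """
    Fetches the instance list from instances.joinmobilizon.org's JSON API and
    fetches instance data for every wanted, not yet registered Mobilizon host.
    """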
1238     logger.debug("args[]='%s' - CALLED!", type(args))
1239
1240     logger.debug("Invoking locking.acquire() ...")
1241     locking.acquire()
1242
1243     source_domain = "instances.joinmobilizon.org"
1244     if sources.is_recent(source_domain):
1245         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1246         return 0
1247     else:
1248         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1249         sources.update(source_domain)
1250
1251     logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1252     raw = utils.fetch_url(
1253         f"https://{source_domain}/api/v1/instances",
1254         network.web_headers,
1255         (config.get("connection_timeout"), config.get("read_timeout"))
1256     ).text
1257     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1258
1259     parsed = json.loads(raw)
1260     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1261
1262     if "data" not in parsed:
1263         logger.warning("parsed()=%d does not contain key 'data'", len(parsed))
1264         return 1
1265
1266     logger.info("Checking %d instances ...", len(parsed["data"]))
1267     for row in parsed["data"]:
1268         logger.debug("row[]='%s'", type(row))
1269         if "host" not in row:
1270             logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1271             continue
1272         elif not domain_helper.is_wanted(row["host"]):
1273             logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1274             continue
1275         elif instances.is_registered(row["host"]):
1276             logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1277             continue
1278
1279         logger.info("Fetching row[host]='%s' ...", row["host"])
1280         federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1281
1282     logger.debug("Success! - EXIT!")
1283     return 0
1284
1285 def fetch_joinmisskey(args: argparse.Namespace) -> int:
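    """
    Fetches instances.json from instanceapp.misskey.page and fetches instance
    data for every wanted, not yet registered Misskey instance listed under
    'instancesInfos'.
    """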
1286     logger.debug("args[]='%s' - CALLED!", type(args))
1287
1288     logger.debug("Invoking locking.acquire() ...")
1289     locking.acquire()
1290
1291     source_domain = "instanceapp.misskey.page"
1292     if sources.is_recent(source_domain):
1293         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1294         return 0
1295     else:
1296         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1297         sources.update(source_domain)
1298
1299     logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1300     raw = utils.fetch_url(
1301         f"https://{source_domain}/instances.json",
1302         network.web_headers,
1303         (config.get("connection_timeout"), config.get("read_timeout"))
1304     ).text
1305     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1306
1307     parsed = json.loads(raw)
1308     logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1309
1310     if "instancesInfos" not in parsed:
1311         logger.warning("parsed()=%d does not contain element 'instancesInfos'", len(parsed))
1312         return 1
1313
1314     logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
1315     for row in parsed["instancesInfos"]:
1316         logger.debug("row[%s]='%s'", type(row), row)
1317         if "url" not in row:
1318             logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1319             continue
1320         elif not domain_helper.is_wanted(row["url"]):
1321             logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1322             continue
1323         elif instances.is_registered(row["url"]):
1324             logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1325             continue
1326
1327         logger.info("Fetching row[url]='%s' ...", row["url"])
1328         federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1329
1330     logger.debug("Success! - EXIT!")
1331     return 0
1332
1333 def fetch_joinfediverse(args: argparse.Namespace) -> int:
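    """
    Scrapes the FediBlock page of joinfediverse.wiki, parses its 'wikitable'
    tables into block records (blocked domain, subdomain(s), block reason(s)),
    expands subdomain entries and records the resulting blocks for the local
    climatejustice.* instances.
    """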
1334     logger.debug("args[]='%s' - CALLED!", type(args))
1335
1336     logger.debug("Invoking locking.acquire() ...")
1337     locking.acquire()
1338
1339     source_domain = "joinfediverse.wiki"
1340     if sources.is_recent(source_domain):
1341         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1342         return 0
1343     else:
1344         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1345         sources.update(source_domain)
1346
1347     logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1348     raw = utils.fetch_url(
1349         f"https://{source_domain}/FediBlock",
1350         network.web_headers,
1351         (config.get("connection_timeout"), config.get("read_timeout"))
1352     ).text
1353     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1354
1355     doc = bs4.BeautifulSoup(raw, "html.parser")
1356     logger.debug("doc[]='%s'", type(doc))
1357
1358     tables = doc.findAll("table", {"class": "wikitable"})
1359
1360     logger.info("Analyzing %d table(s) ...", len(tables))
1361     blocklist = list()
1362     for table in tables:
1363         logger.debug("table[]='%s'", type(table))
1364
1365         rows = table.findAll("tr")
1366         logger.info("Checking %d row(s) ...", len(rows))
1367         block_headers = dict()
1368         for row in rows:
1369             logger.debug("row[%s]='%s'", type(row), row)
1370
1371             headers = row.findAll("th")
1372             logger.debug("Found headers()=%d header(s)", len(headers))
1373             if len(headers) > 1:
1374                 block_headers = dict()
1375                 cnt = 0
1376                 for header in headers:
1377                     cnt = cnt + 1
1378                     logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1379                     text = header.contents[0]
1380
1381                     logger.debug("text[]='%s'", type(text))
1382                     if not isinstance(text, str):
1383                         logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1384                         continue
1385                     elif validators.hostname(text.strip()):
1386                         logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1387                         continue
1388
1389                     text = tidyup.domain(text.strip())
1390                     logger.debug("text='%s' - AFTER!", text)
1391                     if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1392                         logger.debug("Found header: '%s'=%d", text, cnt)
1393                         block_headers[cnt] = text
1394
1395             elif len(block_headers) == 0:
1396                 logger.debug("row is not scrapable - SKIPPED!")
1397                 continue
1398             elif len(block_headers) > 0:
1399                 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1400                 cnt = 0
1401                 block = dict()
1402
1403                 for element in row.find_all(["th", "td"]):
1404                     cnt = cnt + 1
1405                     logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1406                     if cnt in block_headers:
1407                         logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1408
1409                         text = element.text.strip()
1410                         key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1411
1412                         logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1413                         if key == "blocked":
1414                             block[key] = text
1415                         elif key == "block reason(s)":
1416                             block[key] = tidyup.reason(text)
1417                         elif key == "subdomain(s)":
1418                             block[key] = list()
1419                             if text != "":
1420                                 block[key] = text.split("/")
1421                         else:
1422                             logger.debug("key='%s'", key)
1423                             block[key] = text
1424
1425                 logger.debug("block()=%d ...", len(block))
1426                 if len(block) > 0:
1427                     logger.debug("Appending block()=%d ...", len(block))
1428                     blocklist.append(block)
1429
1430     logger.debug("blocklist()=%d", len(blocklist))
1431
1432     database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1433     domains = database.cursor.fetchall()
1434
1435     logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1436     blocking = list()
1437     for block in blocklist:
1438         logger.debug("block='%s'", block)
1439         if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1440             origin = block["blocked"]
1441             logger.debug("origin='%s'", origin)
1442             for subdomain in block["subdomain(s)"]:
1443                 entry = dict(block)  # copy per subdomain, otherwise every appended entry shares one dict and only the last "blocked" value survives
1444                 entry["blocked"] = subdomain + "." + origin
1445                 blocking.append(entry)
1446         else:
1447             blocking.append(block)
1448
1449     logger.debug("blocking()=%d", len(blocking))
1450     for block in blocking:
1451         logger.debug("block[]='%s'", type(block))
1452         if "blocked" not in block:
1453             raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1454
1455         block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1456         logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1457
1458         if block["blocked"] == "":
1459             logger.debug("block[blocked] is empty - SKIPPED!")
1460             continue
1461         elif not domain_helper.is_wanted(block["blocked"]):
1462             logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1463             continue
1464         elif instances.is_recent(block["blocked"]):
1465             logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1466             continue
1467
1468         logger.debug("Processing blocked='%s' ...", block["blocked"])
1469         processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1470
1471     blockdict = list()
1472     for blocker in domains:
1473         blocker = blocker[0]
1474         logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1475         instances.set_last_blocked(blocker)
1476
1477         for block in blocking:
1478             logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1479             block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1480
1481             logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1482             if block["blocked"] == "":
1483                 logger.debug("block[blocked] is empty - SKIPPED!")
1484                 continue
1485             elif not domain_helper.is_wanted(block["blocked"]):
1486                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1487                 continue
1488
1489             logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1490             if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1491                 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1492                 blockdict.append({
1493                     "blocked": block["blocked"],
1494                     "reason" : block["reason"],
1495                 })
1496
1497         if instances.has_pending(blocker):
1498             logger.debug("Flushing updates for blocker='%s' ...", blocker)
1499             instances.update_data(blocker)
1500
1501         logger.debug("Invoking commit() ...")
1502         database.connection.commit()
1503
1504         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1505         if config.get("bot_enabled") and len(blockdict) > 0:
1506             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1507             network.send_bot_post(blocker, blockdict)
1508
1509     logger.debug("Success! - EXIT!")
1510     return 0
1511
1512 def recheck_obfuscation(args: argparse.Namespace) -> int:
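    """
    Re-fetches block lists from instances flagged with has_obfuscation = 1,
    optionally limited by --domain or --software, and tries to deobfuscate
    entries containing '*' or '?' wildcards.
    """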
1513     logger.debug("args[]='%s' - CALLED!", type(args))
1514
1515     logger.debug("Invoking locking.acquire() ...")
1516     locking.acquire()
1517
1518     if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1519         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1520     elif isinstance(args.software, str) and args.software != "" and validators.hostname(args.software):
1521         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1522     else:
1523         database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1524
1525     rows = database.cursor.fetchall()
1526     logger.info("Checking %d domains ...", len(rows))
1527     for row in rows:
1528         logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1529         if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1530             logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1531             continue
1532
1533         blocking = list()
1534         if row["software"] == "pleroma":
1535             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1536             blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1537         elif row["software"] == "mastodon":
1538             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1539             blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1540         elif row["software"] == "lemmy":
1541             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1542             blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1543         elif row["software"] == "friendica":
1544             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1545             blocking = friendica.fetch_blocks(row["domain"])
1546         elif row["software"] == "misskey":
1547             logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1548             blocking = misskey.fetch_blocks(row["domain"])
1549         else:
1550             logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1551
1552         logger.debug("row[domain]='%s'", row["domain"])
1553
1554         # chaos.social requires special care ...
1555         if row["domain"] != "chaos.social" and not blocklists.is_excluded(row["domain"]):
1556             logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1557             instances.set_last_blocked(row["domain"])
1558             instances.set_total_blocks(row["domain"], blocking)
1559
1560         obfuscated = 0
1561         blockdict = list()
1562
1563         logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1564         for block in blocking:
1565             logger.debug("block[blocked]='%s'", block["blocked"])
1566             blocked = None
1567
1568             if block["blocked"] == "":
1569                 logger.debug("block[blocked] is empty - SKIPPED!")
1570                 continue
1571             elif block["blocked"].endswith(".arpa"):
1572                 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1573                 continue
1574             elif block["blocked"].endswith(".tld"):
1575                 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1576                 continue
1577             elif block["blocked"].endswith(".onion"):
1578                 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1579                 continue
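            # A '*' or '?' in the name marks an obfuscated entry (e.g. "examp*e.com");
            # utils.deobfuscate() tries to recover the real domain, using the block's hash when one is provided.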
1580             elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1581                 logger.debug("blocked='%s' is obfuscated.", block["blocked"])
1582                 obfuscated = obfuscated + 1
1583                 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1584             elif not domain_helper.is_wanted(block["blocked"]):
1585                 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1586                 continue
1587             elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1588                 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1589                 continue
1590
1591             logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1592             if blocked is not None and blocked != block["blocked"]:
1593                 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1594                 obfuscated = obfuscated - 1
1595
1596                 if blocks.is_instance_blocked(row["domain"], blocked):
1597                     logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1598                     continue
1599                 elif blacklist.is_blacklisted(blocked):
1600                     logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1601                     continue
1602
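                # Normalize aliased block levels to their canonical name before recording the block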
1603                 block["block_level"] = blocks.alias_block_level(block["block_level"])
1604
1605                 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1606                 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1607                     logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1608                     blockdict.append({
1609                         "blocked": blocked,
1610                         "reason" : block["reason"],
1611                     })
1612
1613         logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1614         instances.set_obfuscated_blocks(row["domain"], obfuscated)
1615
1616         logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1617         if obfuscated == 0 and len(blocking) > 0:
1618             logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1619             instances.set_has_obfuscation(row["domain"], False)
1620
1621         if instances.has_pending(row["domain"]):
1622             logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1623             instances.update_data(row["domain"])
1624
1625         logger.debug("Invoking commit() ...")
1626         database.connection.commit()
1627
1628         logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1629         if config.get("bot_enabled") and len(blockdict) > 0:
1630             logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1631             network.send_bot_post(row["domain"], blockdict)
1632
1633     logger.debug("Success! - EXIT!")
1634     return 0
1635
1636 def fetch_fedilist(args: argparse.Namespace) -> int:
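    """
    Downloads a CSV of instances from demo.fedilist.com, optionally filtered
    by --software, and fetches instance data for every wanted domain that is
    not yet registered or recently crawled.
    """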
1637     logger.debug("args[]='%s' - CALLED!", type(args))
1638
1639     logger.debug("Invoking locking.acquire() ...")
1640     locking.acquire()
1641
1642     source_domain = "demo.fedilist.com"
1643     if sources.is_recent(source_domain):
1644         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1645         return 0
1646     else:
1647         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1648         sources.update(source_domain)
1649
1650     url = f"http://{source_domain}/instance/csv?onion=not"
1651     if args.software is not None and args.software != "":
1652         logger.debug("args.software='%s'", args.software)
1653         url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1654
1655     logger.info("Fetching url='%s' ...", url)
1656     response = reqto.get(
1657         url,
1658         headers=network.web_headers,
1659         timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1660         allow_redirects=False
1661     )
1662
1663     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1664     if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1665         logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.content))
1666         return 1
1667
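    # csv.DictReader treats the first line as the header, so rows can be accessed by column name, e.g. row["hostname"]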
1668     reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1669
1670     logger.debug("reader[]='%s'", type(reader))
1671     if reader is None:
1672         logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1673         return 2
1674
1675     rows = list(reader)
1676
1677     logger.info("Checking %d rows ...", len(rows))
1678     for row in rows:
1679         logger.debug("row[]='%s'", type(row))
1680         if "hostname" not in row:
1681             logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1682             continue
1683
1684         logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1685         domain = tidyup.domain(row["hostname"])
1686         logger.debug("domain='%s' - AFTER!", domain)
1687
1688         if domain == "":
1689             logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1690             continue
1691
1692         logger.debug("domain='%s' - BEFORE!", domain)
1693         domain = domain.encode("idna").decode("utf-8")
1694         logger.debug("domain='%s' - AFTER!", domain)
1695
1696         if not domain_helper.is_wanted(domain):
1697             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1698             continue
1699         elif (args.force is None or not args.force) and instances.is_registered(domain):
1700             logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1701             continue
1702         elif instances.is_recent(domain):
1703             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1704             continue
1705
1706         logger.info("Fetching instances from domain='%s' ...", domain)
1707         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1708
1709     logger.debug("Success! - EXIT!")
1710     return 0
1711
1712 def update_nodeinfo(args: argparse.Namespace) -> int:
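    """
    Re-determines the software type for instances selected by --domain,
    --software or a last_nodeinfo timestamp older than the configured
    'recheck_nodeinfo' interval, and records any changes.
    """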
1713     logger.debug("args[]='%s' - CALLED!", type(args))
1714
1715     logger.debug("Invoking locking.acquire() ...")
1716     locking.acquire()
1717
1718     if args.domain is not None and args.domain != "":
1719         logger.debug("Fetching args.domain='%s'", args.domain)
1720         database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1721     elif args.software is not None and args.software != "":
1722         logger.info("Fetching domains for args.software='%s'", args.software)
1723         database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1724     else:
1725         logger.info("Fetching domains for recently updated ...")
1726         database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1727
1728     domains = database.cursor.fetchall()
1729
1730     logger.info("Checking %d domain(s) ...", len(domains))
1731     cnt = 0
1732     for row in domains:
1733         logger.debug("row[]='%s'", type(row))
1734         if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1735             logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
1736             continue
1737
1738         try:
1739             logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1740             software = federation.determine_software(row["domain"])
1741
1742             logger.debug("Determined software='%s'", software)
1743             if (software != row["software"] and software is not None) or args.force is True:
1744                 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1745                 instances.set_software(row["domain"], software)
1746
1747             if software is not None:
1748                 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1749                 instances.set_success(row["domain"])
1750         except network.exceptions as exception:
1751             logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1752             instances.set_last_error(row["domain"], exception)
1753
1754         instances.set_last_nodeinfo(row["domain"])
1755         instances.update_data(row["domain"])
1756         cnt = cnt + 1
1757
1758     logger.debug("Success! - EXIT!")
1759     return 0
1760
1761 def fetch_instances_social(args: argparse.Namespace) -> int:
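    """
    Queries the instances.social API (requires 'instances_social_api_key' in
    config.json) for its full instance list and fetches instance data for
    every wanted, not yet registered domain.
    """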
1762     logger.debug("args[]='%s' - CALLED!", type(args))
1763
1764     logger.debug("Invoking locking.acquire() ...")
1765     locking.acquire()
1766
1767     source_domain = "instances.social"
1768
1769     if config.get("instances_social_api_key") == "":
1770         logger.error("API key not set. Please set in your config.json file.")
1771         return 1
1772     elif sources.is_recent(source_domain):
1773         logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1774         return 0
1775     else:
1776         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1777         sources.update(source_domain)
1778
1779     headers = {
1780         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1781     }
1782
1783     logger.info("Fetching list from source_domain='%s' ...", source_domain)
1784     fetched = network.get_json_api(
1785         source_domain,
1786         "/api/1.0/instances/list?count=0&sort_by=name",
1787         headers,
1788         (config.get("connection_timeout"), config.get("read_timeout"))
1789     )
1790     logger.debug("fetched[]='%s'", type(fetched))
1791
1792     if "error_message" in fetched:
1793         logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1794         return 2
1795     elif "exception" in fetched:
1796         logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1797         return 3
1798     elif "json" not in fetched:
1799         logger.warning("fetched has no element 'json' - EXIT!")
1800         return 4
1801     elif "instances" not in fetched["json"]:
1802         logger.warning("fetched[json] has no element 'instances' - EXIT!")
1803         return 5
1804
1805     domains = list()
1806     rows = fetched["json"]["instances"]
1807
1808     logger.info("Checking %d row(s) ...", len(rows))
1809     for row in rows:
1810         logger.debug("row[]='%s'", type(row))
1811         domain = tidyup.domain(row["name"])
1812         logger.debug("domain='%s' - AFTER!", domain)
1813
1814         if domain == "":
1815             logger.debug("domain is empty - SKIPPED!")
1816             continue
1817
1818         logger.debug("domain='%s' - BEFORE!", domain)
1819         domain = domain.encode("idna").decode("utf-8")
1820         logger.debug("domain='%s' - AFTER!", domain)
1821
1822         if not domain_helper.is_wanted(domain):
1823             logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1824             continue
1825         elif domain in domains:
1826             logger.debug("domain='%s' is already added - SKIPPED!", domain)
1827             continue
1828         elif instances.is_registered(domain):
1829             logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1830             continue
1831         elif instances.is_recent(domain):
1832             logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1833             continue
1834
1835         logger.info("Fetching instances from domain='%s'", domain)
1836         federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1837
1838     logger.debug("Success! - EXIT!")
1839     return 0
1840
1841 def fetch_relays(args: argparse.Namespace) -> int:
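    """
    Fetches the landing pages of known relays (activityrelay, aoderelay,
    selective-relay), scrapes their registered instances, stores them as the
    relay's peers and fetches instance data for newly seen domains.
    """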
1842     logger.debug("args[]='%s' - CALLED!", type(args))
1843
1844     logger.debug("Invoking locking.acquire() ...")
1845     locking.acquire()
1846
1847     if args.domain is not None and args.domain != "":
1848         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1849     else:
1850         database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1851
1852     domains = list()
1853     rows = database.cursor.fetchall()
1854
1855     logger.info("Checking %d relays ...", len(rows))
1856     for row in rows:
1857         logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1858         peers = list()
1859         if not args.force and instances.is_recent(row["domain"]):
1860             logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
1861             continue
1862
1863         try:
1864             logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1865             raw = utils.fetch_url(
1866                 f"https://{row['domain']}",
1867                 network.web_headers,
1868                 (config.get("connection_timeout"), config.get("read_timeout"))
1869             ).text
1870             logger.debug("raw[%s]()=%d", type(raw), len(raw))
1871         except network.exceptions as exception:
1872             logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1873             instances.set_last_error(row["domain"], exception)
1874             instances.set_last_instance_fetch(row["domain"])
1875             instances.update_data(row["domain"])
1876             continue
1877
1878         doc = bs4.BeautifulSoup(raw, features="html.parser")
1879         logger.debug("doc[]='%s'", type(doc))
1880
1881         logger.debug("row[software]='%s'", row["software"])
1882         if row["software"] == "activityrelay":
1883             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1884             tags = doc.findAll("p")
1885
1886             logger.debug("Checking %d paragraphs ...", len(tags))
1887             for tag in tags:
1888                 logger.debug("tag[]='%s'", type(tag))
1889                 if len(tag.contents) == 0:
1890                     logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1891                     continue
1892                 elif "registered instances" not in tag.contents[0]:
1893                     logger.debug("Skipping paragraph, text not found.")
1894                     continue
1895
1896                 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
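                # tag.contents interleaves domain names (NavigableString) with markup such as <br/>; keep only the plain strings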
1897                 for domain in tag.contents:
1898                     logger.debug("domain[%s]='%s'", type(domain), domain)
1899                     if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1900                         continue
1901
1902                     domain = str(domain)
1903                     logger.debug("domain='%s'", domain)
1904                     if not domain_helper.is_wanted(domain):
1905                         logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1906                         continue
1907
1908                     logger.debug("domain='%s' - BEFORE!", domain)
1909                     domain = tidyup.domain(domain)
1910                     logger.debug("domain='%s' - AFTER!", domain)
1911
1912                     if domain == "":
1913                         logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1914                         continue
1915                     elif domain not in peers:
1916                         logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1917                         peers.append(domain)
1918
1919                     if dict_helper.has_key(domains, "domain", domain):
1920                         logger.debug("domain='%s' already added", domain)
1921                         continue
1922
1923                     logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1924                     domains.append({
1925                         "domain": domain,
1926                         "origin": row["domain"],
1927                     })
1928         elif row["software"] in ["aoderelay", "selective-relay"]:
1929             logger.debug("Checking row[domain]='%s' ...", row["domain"])
1930             if row["software"] == "aoderelay":
1931                 tags = doc.findAll("section", {"class": "instance"})
1932             else:
1933                 tags = doc.find("div", {"id": "instances"}).findAll("li")
1934
1935             logger.debug("Checking %d tags ...", len(tags))
1936             for tag in tags:
1937                 logger.debug("tag[]='%s'", type(tag))
1938
1939                 link = tag.find("a")
1940                 logger.debug("link[%s]='%s'", type(link), link)
1941                 if link is None:
1942                     logger.warning("tag='%s' has no a-tag ...", tag)
1943                     continue
1944
1945                 components = urlparse(link["href"])
1946                 domain = components.netloc.lower()
1947
1948                 if not domain_helper.is_wanted(domain):
1949                     logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1950                     continue
1951
1952                 logger.debug("domain='%s' - BEFORE!", domain)
1953                 domain = tidyup.domain(domain)
1954                 logger.debug("domain='%s' - AFTER!", domain)
1955
1956                 if domain == "":
1957                     logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1958                     continue
1959                 elif domain not in peers:
1960                     logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1961                     peers.append(domain)
1962
1963                 if dict_helper.has_key(domains, "domain", domain):
1964                     logger.debug("domain='%s' already added", domain)
1965                     continue
1966
1967                 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1968                 domains.append({
1969                     "domain": domain,
1970                     "origin": row["domain"],
1971                 })
1972         else:
1973             logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1974
1975         logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1976         instances.set_last_instance_fetch(row["domain"])
1977
1978         logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1979         instances.set_total_peers(row["domain"], peers)
1980
1981         logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1982         instances.update_data(row["domain"])
1983
1984     logger.info("Checking %d domains ...", len(domains))
1985     for row in domains:
1986         logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1987         if instances.is_registered(row["domain"]):
1988             logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1989             continue
1990
1991         logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1992         federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1993
1994     logger.debug("Success! - EXIT!")
1995     return 0
1996
1997 def convert_idna(args: argparse.Namespace) -> int:
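    """
    Converts all non-punycode domain columns in the 'instances' and 'blocks'
    tables to their IDNA representation, e.g. 'münchen.example' becomes
    'xn--mnchen-3ya.example'.
    """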
1998     logger.debug("args[]='%s' - CALLED!", type(args))
1999
2000     database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2001     rows = database.cursor.fetchall()
2002
2003     logger.debug("rows[]='%s'", type(rows))
2004     instances.translate_idnas(rows, "domain")
2005
2006     database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2007     rows = database.cursor.fetchall()
2008
2009     logger.debug("rows[]='%s'", type(rows))
2010     instances.translate_idnas(rows, "origin")
2011
2012     database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2013     rows = database.cursor.fetchall()
2014
2015     logger.debug("rows[]='%s'", type(rows))
2016     blocks.translate_idnas(rows, "blocker")
2017
2018     database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2019     rows = database.cursor.fetchall()
2020
2021     logger.debug("rows[]='%s'", type(rows))
2022     blocks.translate_idnas(rows, "blocked")
2023
2024     logger.debug("Success! - EXIT!")
2025     return 0