# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import argparse
import csv
import inspect
import json
import logging

from urllib.parse import urlparse

import atoma
import bs4
import markdown
import reqto
import validators

from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import blocklists
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import dicts as dict_helper
from fba.helpers import domain as domain_helper
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import csrf
from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)

def check_instance(args: argparse.Namespace) -> int:
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    status = 0
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
        status = 100
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
        status = 101
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)
        status = 102
    else:
        logger.info("args.domain='%s' is not known", args.domain)

    logger.debug("status=%d - EXIT!", status)
    return status

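# Usage sketch (hypothetical values): the argparse dispatcher in fba.py wires
# sub-commands to these handlers, roughly like:
#
#   args = argparse.Namespace(domain="example.social")
#   status = check_instance(args)   # 0 means the domain is valid, not blacklisted and not yet registered
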
def check_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    cnt = 0
    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
            continue
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
            cnt = cnt + 1

    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
    return 0

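# The "idna" codec used above converts internationalized domain names to their
# punycode form, per label, e.g.:
#
#   "münchen.social".encode("idna").decode("utf-8")  # -> "xn--mnchen-3ya.social"
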
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers by yourself here
    headers = dict()
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    try:
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
        return 1

    try:
        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
        fetched = network.get_json_api(
            source_domain,
            "/api/v1/servers/all.json?scope=All&country=all&language=all",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
        if "error_message" in fetched:
            logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
            return 1
        elif "data" not in fetched["json"]:
            logger.warning("API did not return JSON with 'data' element - EXIT!")
            return 1

        rows = fetched["json"]["data"]
        logger.info("Checking %d fetched rows ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            if "domain" not in row:
                logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
                continue
            elif row["domain"] in [None, ""]:
                logger.debug("row[domain]='%s' is empty - SKIPPED!", row["domain"])
                continue

            logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
            domain = row["domain"].encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.debug("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
    except network.exceptions as exception:
        logger.warning("Cannot fetch JSON,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 1

    logger.debug("Success! - EXIT!")
    return 0

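# Judging from the checks in fetch_pixelfed_api() above (an assumption, not a
# documented contract), a successful network.get_json_api() result wraps the
# pixelfed.org payload like:
#
#   {"json": {"data": [{"domain": "pix.example"}, ...]}}
#
# while a failed one carries an "error_message" element instead.
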
def fetch_bkali(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    domains = list()
    try:
        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            source_domain,
            "/v1/graphql",
            json.dumps({
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
            })
        )

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for source_domain='%s' returned error message='%s' - EXIT!", source_domain, fetched["error_message"])
            return 1
        elif "json" not in fetched:
            logger.warning("post_json_api() returned fetched[]='%s' with missing 'json' element - EXIT!", type(fetched))
            return 1
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s' - EXIT!", fetched["json"]["error"]["message"])
            return 1

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
        if len(rows) == 0:
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                continue
            elif entry["domain"] in [None, ""]:
                logger.debug("entry[domain]='%s' is empty - SKIPPED!", entry["domain"])
                continue
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                continue
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
                continue
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has recently been crawled - SKIPPED!", entry["domain"])
                continue

            logger.debug("Adding domain='%s' ...", entry["domain"])
            domains.append(entry["domain"])
    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
        return 1

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, "tak.teleyal.blog", None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

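# A successful reply to the GraphQL query above is expected (assumption derived
# from the rows["data"]["nodeinfo"] checks) to look roughly like:
#
#   {"data": {"nodeinfo": [{"domain": "example.social"}, ...]}}
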
def fetch_blocks(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
            return 102

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
        )
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_blocked ASC", [args.software]
        )
    elif args.only_none:
        # Check only entries with total_blocks=None
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'piefed') AND nodeinfo_url IS NOT NULL AND total_blocks IS NULL ORDER BY last_blocked ASC, total_blocks DESC"
        )
    else:
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'piefed') AND nodeinfo_url IS NOT NULL ORDER BY last_blocked ASC, total_blocks DESC"
        )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
            continue
        elif not args.force and instances.is_recent(blocker, "last_blocked"):
            logger.debug("blocker='%s' has recently been crawled - SKIPPED!", blocker)
            continue

        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # c.s isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or software_helper.is_relay(software) or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs, fetch_oliphant, fetch_csv instead!", blocker)
            continue

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.debug("blocker='%s',software='%s',blocking()=%d", blocker, software, len(blocking))
        if len(blocking) == 0:
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
            else:
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
            logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        blockdict = list()
        deobfuscated = obfuscated = 0

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
                continue

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"] = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] in [None, ""]:
                logger.warning("block[blocked]='%s' is empty, blocker='%s'", block["blocked"], blocker)
                continue
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
                logger.debug("blocked='%s' is an I2P domain - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED!", block["blocked"])
                continue
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)
                instances.set_has_obfuscation(blocker, True)
                obfuscated = obfuscated + 1

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["digest"] if "digest" in block else None)

                logger.debug("row[]='%s'", type(row))
                if row is None:
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    continue

                deobfuscated = deobfuscated + 1
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]

            logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
            if block["blocked"] in [None, ""]:
                logger.debug("block[blocked]='%s' is empty - SKIPPED!", block["blocked"])
                continue

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
                continue
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("block[blocked]='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
                continue
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] in ["reject", "suspend"] and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
                blockdict.append({
                    "blocked": block["blocked"],
                    "reason" : block["reason"],
                })

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.info("blocker='%s' has %d obfuscated domain(s) and %d of them could be deobfuscated.", blocker, obfuscated, deobfuscated)
        instances.set_obfuscated_blocks(blocker, obfuscated)

        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
    return 0

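# Each entry handed back by the fetch_blocks() implementations is a dict; a
# minimal sketch of the expected shape, with keys taken from the loop above
# ("digest" being optional and illustrative values throughout):
#
#   {
#       "blocker"    : "example.social",
#       "blocked"    : "bad.example",
#       "reason"     : "spam",
#       "block_level": "reject",
#   }
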
def fetch_observer(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    types = list()
    if args.software is None:
        logger.info("Fetching software list ...")
        raw = network.fetch_url(
            f"https://{source_domain}",
            network.web_headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        ).text
        logger.debug("raw[%s]()=%d", type(raw), len(raw))

        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        if navbar is None:
            logger.warning("Cannot find navigation bar, cannot continue!")
            return 1

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        for item in items:
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")
                continue

            logger.debug("Appending item.text='%s' ...", item.text)
            types.append(tidyup.domain(item.text))
    else:
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching table data for %d software type(s) ...", len(types))
    for software in types:
        logger.debug("software='%s'", software)

        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
            continue

        items = list()
        try:
            logger.debug("Fetching table data for software='%s' ...", software)
            raw = network.post_json_api(
                f"api.{source_domain}",
                "/",
                json.dumps({
                    "query": "{nodes(softwarename:\"" + software + "\"){domain}}"
                })
            )

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            if "exception" in raw:
                logger.warning("software='%s' has caused an exception: '%s' - raising again ...", software, type(raw["exception"]))
                raise raw["exception"]
            elif "error_message" in raw:
                logger.warning("software='%s' has caused error message: '%s' - SKIPPED!", software, raw["error_message"])
                continue
            elif not "data" in raw["json"]:
                logger.warning("Cannot find key 'data' in raw[json]()=%d", len(raw["json"]))
                continue
            elif not "nodes" in raw["json"]["data"]:
                logger.warning("Cannot find key 'nodes' in raw[json][data]()=%d", len(raw["json"]["data"]))
                continue

            items = raw["json"]["data"]["nodes"]
            logger.debug("items()=%d", len(items))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
            continue

        logger.info("Checking %d items,software='%s' ...", len(items), software)
        for item in items:
            logger.debug("item[]='%s'", type(item))
            if not "domain" in item:
                logger.debug("item()=%d has no element 'domain'", len(item))
                continue

            logger.debug("item[domain]='%s' - BEFORE!", item["domain"])
            domain = tidyup.domain(item["domain"]) if item["domain"] not in [None, ""] else None
            logger.debug("domain='%s' - AFTER!", domain)

            if domain in [None, ""]:
                logger.debug("domain[%s]='%s' is empty after tidyup.domain(): item[domain]='%s' - SKIPPED!", type(domain), domain, item["domain"])
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching instances for domain='%s',software='%s' ...", domain, software)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

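# The GraphQL query above asks api.fediverse.observer for all domains running
# one software type; judging from the key checks in the loop (an assumption,
# not a documented schema), the reply is shaped like:
#
#   {"data": {"nodes": [{"domain": "example.social"}, ...]}}
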
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    blocker = "todon.eu"
    blockdict = list()
    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not domain_helper.is_wanted(blocked):
                logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                continue
            elif not domain_helper.is_wanted(blocker):
                logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
                continue
            elif blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
                continue
            elif not instances.is_registered(blocked):
                try:
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
                blockdict.append({
                    "blocked": blocked,
                    "reason" : None,
                })

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_cs(args: argparse.Namespace):
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    # Markdown extension needed to render the tables in federation.md
    extensions = ["tables"]

    blockdict = list()
    blocklist = {
        "silenced": list(),
        "reject"  : list(),
    }

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if not "domain" in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                    continue
                elif not instances.is_registered(row["domain"]):
                    try:
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
                    blockdict.append({
                        "blocked": row["domain"],
                        "reason" : row["reason"],
                    })

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")

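# federation.md is a Markdown page with "Silenced instances" and "Blocked
# instances" tables; an illustrative (not verbatim) excerpt of what the parser
# above consumes, with domain and reason columns:
#
#   ## Blocked Instances
#
#   | Instance    | Reason |
#   |-------------|--------|
#   | bad.example | spam   |
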
def fetch_fba_rss(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    domains = list()

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    components = urlparse(args.feed)
    domain = components.netloc.lower().split(":")[0]

    logger.debug("domain='%s'", domain)
    if sources.is_recent(domain):
        logger.info("API from domain='%s' has recently been accessed - EXIT!", domain)
        return 0

    logger.debug("domain='%s' has not been recently used, marking ...", domain)
    sources.update(domain)

    logger.info("Fetching FBA-specific RSS args.feed='%s' ...", args.feed)
    response = network.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            domain = item.link.split("=")[1]
            domain = tidyup.domain(domain) if domain not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

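# item.link.split("=")[1] above implies feed item links of the form (assumed,
# illustrative URL):
#
#   https://fba.example/?domain=example.social
#
# i.e. the instance domain is carried in a single query parameter.
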
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    domains = list()

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = network.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            elements = doc.findAll("a")

            logger.debug("Checking %d element(s) ...", len(elements))
            for element in elements:
                logger.debug("element[%s]='%s'", type(element), element)
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href) if href not in [None, ""] else None

                    logger.debug("domain='%s' - AFTER!", domain)
                    if domain in [None, ""]:
                        logger.debug("domain[%s]='%s' is empty after tidyup.domain(): href='%s' - SKIPPED!", type(domain), domain, href)
                        continue

                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                        continue
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                        continue
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                        continue
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                        continue

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))
    if len(domains) > 0:
        logger.info("Adding %d new instances ...", len(domains))
        for domain in domains:
            logger.debug("domain='%s'", domain)

            try:
                logger.info("Fetching instances from domain='%s' ...", domain)
                federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
            except network.exceptions as exception:
                logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
                instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_instances(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    rows = list()

    # Is domain or software set?
    if args.domain not in [None, ""]:
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
            return 100
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
            return 101

        logger.debug("args.domain='%s' - BEFORE!", args.domain)
        domain = tidyup.domain(args.domain)
        logger.debug("domain='%s' - AFTER!", domain)

        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE domain = ? LIMIT 1", [domain])
        rows = database.cursor.fetchall()
    elif args.software not in [None, ""]:
        logger.debug("args.software='%s' - BEFORE!", args.software)
        software = software_helper.alias(args.software)
        logger.debug("software='%s' - AFTER!", software)

        database.cursor.execute("SELECT domain, origin, software FROM instances WHERE software = ? ORDER BY last_instance_fetch ASC", [software])
        rows = database.cursor.fetchall()

    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s',row[origin]='%s',row[software]='%s'", row["domain"], row["origin"], row["software"])
        if row["software"] is None and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' has no software detected. You can try to run ./fba.py update_nodeinfo --domain=%s --force to get it updated - SKIPPED!", row["domain"], row["domain"])
            continue
        elif software_helper.is_relay(row["software"]) and instances.is_registered(row["domain"]):
            logger.warning("row[domain]='%s' is of software type '%s' which is not supported by this command. Please invoke fetch_relays instead - SKIPPED!", row["domain"], row["software"])
            continue
        elif not args.force and args.software not in [None, ""] and instances.is_recent(row["domain"]):
            logger.debug("row[domain]='%s' has recently been crawled - SKIPPED!", row["domain"])
            continue

        try:
            logger.info("Fetching instances from row[domain]='%s',row[origin]='%s',row[software]='%s' ...", row["domain"], row["origin"], row["software"])
            federation.fetch_instances(row["domain"], row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)
            instances.update(row["domain"])

    # A single domain or software type was requested - don't crawl the rest
    if args.domain not in [None, ""] or args.software not in [None, ""]:
        logger.debug("Not fetching more instances - BREAK!")
        return 0

    # Loop through some instances
    database.cursor.execute(
        "SELECT domain, origin, software FROM instances \
WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb', 'smithereen', 'vebinet', 'hugo', 'toki') \
ORDER BY total_peers DESC, last_response_time ASC, last_updated ASC"
    )

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for row in rows:
        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
            continue

        try:
            logger.info("Fetching instances for domain='%s',origin='%s',software='%s' ...", domain, row["origin"], row["software"])
            federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
    return 0

def fetch_csv(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
    for block in blocklists.csv_files:
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])

        # Is domain given and not equal blocker?
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_csv) ...", block["blocker"], block["csv_url"])
        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)

    logger.debug("Success - EXIT!")
    return 0

def fetch_oliphant(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "codeberg.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"

    logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
    for block in blocklists.oliphant_blocklists:
        # Is domain given and not equal blocker?
        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
        if isinstance(args.domain, str) and args.domain != block["blocker"]:
            logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
            continue

        url = f"{base_url}/{block['csv_url']}"

        logger.debug("Invoking processing.csv_block(%s, %s, fetch_oliphant) ...", block["blocker"], url)
        processing.csv_block(block["blocker"], url, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_txt(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    logger.info("Checking %d text file(s) ...", len(blocklists.txt_files))
    for row in blocklists.txt_files:
        logger.debug("Fetching row[url]='%s' ...", row["url"])
        response = network.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
        if response.ok and response.status_code == 200 and response.text != "":
            logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
            domains = response.text.strip().split("\n")

            logger.info("Processing %d domains ...", len(domains))
            for domain in domains:
                logger.debug("domain='%s' - BEFORE!", domain)
                domain = tidyup.domain(domain) if domain not in [None, ""] else None
                logger.debug("domain='%s' - AFTER!", domain)

                if domain in [None, ""]:
                    logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
                    continue
                elif not domain_helper.is_wanted(domain):
                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    continue
                elif not args.force and instances.is_registered(domain):
                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    continue

                logger.debug("Processing domain='%s',row[blocker]='%s' ...", domain, row["blocker"])
                processed = processing.instance(domain, row["blocker"], inspect.currentframe().f_code.co_name, force=args.force)
                logger.debug("processed='%s'", processed)

    logger.debug("Success! - EXIT!")
    return 0

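# The text files processed above are plain newline-separated blocklists, one
# domain per line (illustrative):
#
#   bad.example
#   worse.example
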
def fetch_fedipact(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "fedipact.online"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching / from source_domain='%s' ...", source_domain)
    response = network.fetch_url(
        f"https://{source_domain}",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and response.text != "":
        logger.debug("Parsing %d Bytes ...", len(response.text))

        doc = bs4.BeautifulSoup(response.text, "html.parser")
        logger.debug("doc[]='%s'", type(doc))

        rows = doc.findAll("li")
        logger.info("Checking %d row(s) ...", len(rows))
        for row in rows:
            logger.debug("row[]='%s'", type(row))
            domain = tidyup.domain(row.contents[0]) if row.contents[0] not in [None, ""] else None

            logger.debug("domain='%s' - AFTER!", domain)
            if domain in [None, ""]:
                logger.debug("domain[%s]='%s' is empty after tidyup.domain(): row.contents[0]='%s' - SKIPPED!", type(domain), domain, row.contents[0])
                continue

            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                continue
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                continue
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
                continue

            logger.info("Fetching domain='%s' ...", domain)
            federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinmobilizon(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "instances.joinmobilizon.org"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching instances from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/api/v1/instances",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    parsed = json.loads(raw)
    logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))

    if "data" not in parsed:
        logger.warning("parsed()=%d does not contain key 'data' - EXIT!", len(parsed))
        return 1

    logger.info("Checking %d instances ...", len(parsed["data"]))
    for row in parsed["data"]:
        logger.debug("row[]='%s'", type(row))
        if "host" not in row:
            logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
            continue
        elif not domain_helper.is_wanted(row["host"]):
            logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
            continue
        elif instances.is_registered(row["host"]):
            logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
            continue
        elif instances.is_recent(row["host"]):
            logger.debug("row[host]='%s' has recently been crawled - SKIPPED!", row["host"])
            continue

        logger.info("Fetching row[host]='%s' ...", row["host"])
        federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

def fetch_joinmisskey(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "instanceapp.misskey.page"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
    raw = network.fetch_url(
        f"https://{source_domain}/instances.json",
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    ).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    parsed = json.loads(raw)
    logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))

    if "instancesInfos" not in parsed:
        logger.warning("parsed()=%d does not contain element 'instancesInfos' - EXIT!", len(parsed))
        return 1

    logger.info("Checking %d instance(s) ...", len(parsed["instancesInfos"]))
    for row in parsed["instancesInfos"]:
        logger.debug("row[%s]='%s'", type(row), row)
        if "url" not in row:
            logger.warning("row()=%d has no element 'url' - SKIPPED!", len(row))
            continue
        elif not domain_helper.is_wanted(row["url"]):
            logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
            continue
        elif instances.is_registered(row["url"]):
            logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
            continue
        elif instances.is_recent(row["url"]):
            logger.debug("row[url]='%s' has recently been crawled - SKIPPED!", row["url"])
            continue

        logger.info("Fetching row[url]='%s' ...", row["url"])
        federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0

1309 def recheck_obfuscation(args: argparse.Namespace) -> int:
1310 logger.debug("args[]='%s' - CALLED!", type(args))
1312 logger.debug("Invoking locking.acquire() ...")
1315 if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1316 logger.debug("Fetching record for args.domain='%s' ...", args.domain)
1317 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND domain = ?", [args.domain])
1318 elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1319 logger.debug("Fetching records for args.software='%s' ...", args.software)
1320 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE (has_obfuscation = 1 OR has_obfuscation IS NULL) AND software = ?", [args.software])
1322 logger.debug("Fetching records where domains have obfuscated block entries ...")
1323 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 OR has_obfuscation IS NULL")
1325 rows = database.cursor.fetchall()
1326 logger.info("Checking %d domains ...", len(rows))
1328 logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1329 if not domain_helper.is_wanted(row["domain"]):
1330 logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1331 if args.delete_unwanted:
1332 logger.info("Deleting unwanted row[domain]='%s' ...", row["domain"])
1333 instances.delete(row["domain"])
1334 blocks.delete(row["domain"])
1336 elif blacklist.is_blacklisted(row["domain"]):
1337 logger.warning("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1339 elif (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1340 logger.debug("row[domain]='%s' has recently been checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1343 logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1344 blocking = federation.fetch_blocks(row["domain"])
1346 logger.debug("blocking()=%d", len(blocking))
1347 if len(blocking) == 0:
1348 logger.debug("Empty blocking list, trying individual fetch_blocks() for row[software]='%s' ...", row["software"])
1349 if row["software"] == "pleroma":
1350 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1351 blocking = pleroma.fetch_blocks(row["domain"])
1352 elif row["software"] == "mastodon":
1353 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1354 blocking = mastodon.fetch_blocks(row["domain"])
1355 elif row["software"] == "lemmy":
1356 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1357 blocking = lemmy.fetch_blocks(row["domain"])
1358 elif row["software"] == "friendica":
1359 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1360 blocking = friendica.fetch_blocks(row["domain"])
1361 elif row["software"] == "misskey":
1362 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1363 blocking = misskey.fetch_blocks(row["domain"])
1365 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1367 # c.s isn't part of oliphant's "hidden" blocklists
1368 logger.debug("row[domain]='%s'", row["domain"])
1369 if row["domain"] != "chaos.social" and row["software"] is not None and not software_helper.is_relay(row["software"]) and not blocklists.has(row["domain"]):
1370 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1371 instances.set_last_blocked(row["domain"])
1372 instances.set_total_blocks(row["domain"], blocking)
1377 logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1378 for block in blocking:
1379 logger.debug("block[blocked]='%s'", block["blocked"])
1382 if block["blocked"] == "":
1383 logger.debug("block[blocked] is empty - SKIPPED!")
1385 elif block["blocked"].endswith(".onion"):
1386 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1388 elif block["blocked"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1389 logger.debug("blocked='%s' is an I2P onion domain name - SKIPPED!", block["blocked"])
1391 elif block["blocked"].endswith(".arpa"):
1392 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1394 elif block["blocked"].endswith(".tld"):
1395 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1397 elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1398 logger.debug("block='%s' is obfuscated.", block["blocked"])
1399 obfuscated = obfuscated + 1
1400 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["digest"] if "digest" in block else None)
1401 elif not domain_helper.is_wanted(block["blocked"]):
1402 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1404 elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1405 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1408 logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1409 if blocked is not None and blocked != block["blocked"]:
1410 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1411 obfuscated = obfuscated - 1
1413 if blacklist.is_blacklisted(blocked):
1414 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1416 elif blacklist.is_blacklisted(row["domain"]):
1417 logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1419 elif blocks.is_instance_blocked(row["domain"], blocked):
1420 logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1423 block["block_level"] = blocks.alias_block_level(block["block_level"])
1425 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1426 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1427 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1430 "reason" : block["reason"],
1433 logger.debug("Setting obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1434 instances.set_has_obfuscation(row["domain"], (obfuscated > 0))
1435 instances.set_obfuscated_blocks(row["domain"], obfuscated)
1437 logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1438 if instances.has_pending(row["domain"]):
1439 logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1440 instances.update(row["domain"])
1442 logger.debug("Invoking commit() ...")
1443 database.connection.commit()
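# With all changes committed, optionally announce the newly found blocks via the bot.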
1445 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1446 if config.get("bot_enabled") and len(blockdict) > 0:
1447 logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1448 network.send_bot_post(row["domain"], blockdict)
1450 logger.debug("Success! - EXIT!")
1453 def fetch_fedilist(args: argparse.Namespace) -> int:
1454 logger.debug("args[]='%s' - CALLED!", type(args))
1456 logger.debug("Invoking locking.acquire() ...")
1459 source_domain = "demo.fedilist.com"
1460 if sources.is_recent(source_domain):
1461 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1464 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1465 sources.update(source_domain)
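# fedilist exposes a CSV export; "onion=not" filters out Tor hidden services and
# an optional software parameter limits the result to a single platform.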
1467 url = f"http://{source_domain}/instance/csv?onion=not"
1468 if args.software is not None and args.software != "":
1469 logger.debug("args.software='%s'", args.software)
1470 url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1472 logger.info("Fetching url='%s' ...", url)
1473 response = reqto.get(
1475 headers=network.web_headers,
1476 timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1477 allow_redirects=False
1480 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1481 if not response.ok or response.status_code > 200 or len(response.content) == 0:
1482 logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1485 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1487 logger.debug("reader[]='%s'", type(reader))
1489 logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1494 logger.info("Checking %d rows ...", len(rows))
1496 logger.debug("row[]='%s'", type(row))
1497 if "hostname" not in row:
1498 logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1501 logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1502 domain = tidyup.domain(row["hostname"]) if row["hostname"] not in [None, ""] else None
1503 logger.debug("domain='%s' - AFTER!", domain)
1505 if domain in [None, ""]:
1506 logger.debug("domain[%s]='%s' is empty after tidyup.domain(): row[hostname]='%s' - SKIPPED!", type(domain), domain, row["hostname"])
1509 logger.debug("domain='%s' - BEFORE!", domain)
1510 domain = domain.encode("idna").decode("utf-8")
1511 logger.debug("domain='%s' - AFTER!", domain)
1513 if not domain_helper.is_wanted(domain):
1514 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1516 elif (args.force is None or not args.force) and instances.is_registered(domain):
1517 logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1519 elif instances.is_recent(domain):
1520 logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1523 logger.info("Fetching instances from domain='%s' ...", domain)
1524 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1526 logger.debug("Success! - EXIT!")
1529 def update_nodeinfo(args: argparse.Namespace) -> int:
1530 logger.debug("args[]='%s' - CALLED!", type(args))
1532 logger.debug("Invoking locking.acquire() ...")
1535 if args.domain is not None and args.domain != "":
1536 logger.debug("Fetching args.domain='%s'", args.domain)
1537 database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
1538 elif args.software is not None and args.software != "":
1539 logger.info("Fetching domains for args.software='%s'", args.software)
1540 database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? ORDER BY last_updated ASC", [args.software])
1541 elif args.mode is not None and args.mode != "":
1542 logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1543 database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? ORDER BY last_updated ASC", [args.mode])
1544 elif args.no_software:
1545 logger.info("Fetching domains with no software type detected ...")
1546 database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL ORDER BY last_updated ASC")
1547 elif args.with_software:
1548 logger.info("Fetching domains with any software type detected ...")
1549 database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NOT NULL ORDER BY last_updated ASC")
1551 logger.info("Fetching domains with other detection mode than AUTO_DISOVERY being set ...")
1552 database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NOT NULL AND detection_mode != 'AUTO_DISCOVERY' ORDER BY last_updated ASC")
1553 elif args.no_detection:
1554 logger.info("Fetching domains with no detection mode being set ...")
1555 database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode IS NULL ORDER BY last_updated ASC")
1557 logger.info("Fetching domains with domain name and software being the same ...")
1558 database.cursor.execute("SELECT domain, software FROM instances WHERE domain=software ORDER BY last_updated ASC")
1560 logger.info("Fetching domains for recently updated ...")
1561 database.cursor.execute("SELECT domain, software FROM instances ORDER BY last_updated ASC")
1563 domains = database.cursor.fetchall()
1565 logger.info("Checking %d domain(s) ...", len(domains))
1568 logger.debug("row[]='%s'", type(row))
1569 if row["domain"].endswith(".i2p") and not config.get("allow_i2p_domain"):
1570 logger.debug("row[domain]='%s' is an I2P address - SKIPPED", row["domain"])
1572 elif row["domain"].endswith(".onion"):
1573 logger.debug("row[domain]='%s' is a TOR .onion domain - SKIPPED", row["domain"])
1575 elif row["domain"].endswith(".arpa"):
1576 logger.debug("row[domain]='%s' is a reverse IP address - SKIPPED", row["domain"])
1578 elif row["domain"].endswith(".tld"):
1579 logger.debug("row[domain]='%s' is a fake domain - SKIPPED", row["domain"])
1581 elif blacklist.is_blacklisted(row["domain"]):
1582 logger.debug("row[domain]='%s' is blacklisted - SKIPPED!", row["domain"])
1584 elif not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1585 logger.debug("row[domain]='%s' has recently been checked - SKIPPED!", row["domain"])
1589 logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1590 software = federation.determine_software(row["domain"])
1592 logger.debug("Determined software='%s'", software)
1593 if (software != row["software"] and software is not None) or args.force is True:
1594 logger.debug("software='%s'", software)
1595 if software is None:
1596 logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1597 instances.set_nodeinfo_url(row["domain"], None)
1599 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1600 instances.set_software(row["domain"], software)
1602 if software is not None:
1603 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1604 instances.set_success(row["domain"])
1605 except network.exceptions as exception:
1606 logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1607 instances.set_last_error(row["domain"], exception)
1609 instances.set_last_nodeinfo(row["domain"])
1610 instances.update(row["domain"])
1613 logger.debug("Success! - EXIT!")
1616 def fetch_instances_social(args: argparse.Namespace) -> int:
1617 logger.debug("args[]='%s' - CALLED!", type(args))
1619 logger.debug("Invoking locking.acquire() ...")
1622 source_domain = "instances.social"
1624 if config.get("instances_social_api_key") == "":
1625 logger.error("API key not set. Please set in your config.json file.")
1627 elif sources.is_recent(source_domain):
1628 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1631 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1632 sources.update(source_domain)
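# instances.social requires an API key, passed as a bearer token with every request.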
1635 "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1638 logger.info("Fetching list from source_domain='%s' ...", source_domain)
1639 fetched = network.get_json_api(
1641 "/api/1.0/instances/list?count=0&sort_by=name",
1643 timeout=(config.get("connection_timeout"), config.get("read_timeout"))
1645 logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1647 if "error_message" in fetched:
1648 logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1650 elif "exception" in fetched:
1651 logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1653 elif "json" not in fetched:
1654 logger.warning("fetched has no element 'json' - EXIT!")
1656 elif "instances" not in fetched["json"]:
1657 logger.warning("fetched[row] has no element 'instances' - EXIT!")
1661 rows = fetched["json"]["instances"]
1663 logger.info("Checking %d row(s) ...", len(rows))
1665 logger.debug("row[]='%s'", type(row))
1666 domain = tidyup.domain(row["name"]) if row["name"] not in [None, ""] else None
1667 logger.debug("domain='%s' - AFTER!", domain)
1669 if domain in [None, ""]:
1670 logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
1673 logger.debug("domain='%s' - BEFORE!", domain)
1674 domain = domain.encode("idna").decode("utf-8")
1675 logger.debug("domain='%s' - AFTER!", domain)
1677 if not domain_helper.is_wanted(domain):
1678 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1680 elif domain in domains:
1681 logger.debug("domain='%s' is already added - SKIPPED!", domain)
1683 elif instances.is_registered(domain):
1684 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1686 elif instances.is_recent(domain):
1687 logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1690 logger.info("Fetching instances from domain='%s' ...", domain)
1691 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1693 logger.debug("Success! - EXIT!")
1696 def fetch_relaylist(args: argparse.Namespace) -> int:
1697 logger.debug("args[]='%s' - CALLED!", type(args))
1699 logger.debug("Invoking locking.acquire() ...")
1702 source_domain = "api.relaylist.com"
1704 if sources.is_recent(source_domain):
1705 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1708 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1709 sources.update(source_domain)
1711 logger.info("Fetching list from source_domain='%s' ...", source_domain)
1712 fetched = network.get_json_api(
1716 (config.get("connection_timeout"), config.get("read_timeout"))
1718 logger.debug("fetched(%d)[]='%s'", len(fetched), type(fetched))
1720 if "error_message" in fetched:
1721 logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1723 elif "exception" in fetched:
1724 logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1726 elif "json" not in fetched:
1727 logger.warning("fetched has no element 'json' - EXIT!")
1732 logger.info("Checking %d row(s) ...", len(fetched["json"]))
1733 for row in fetched["json"]:
1734 logger.debug("row[]='%s'", type(row))
1735 domain = urlparse(row["url"]).netloc.lower().split(":")[0]
1736 logger.debug("domain='%s' - AFTER!", domain)
1738 if domain in [None, ""]:
1739 logger.debug("domain[%s]='%s' is empty after tidyup.domain() - SKIPPED!", type(domain), domain)
1742 logger.debug("domain='%s' - BEFORE!", domain)
1743 domain = domain.encode("idna").decode("utf-8")
1744 logger.debug("domain='%s' - AFTER!", domain)
1746 if not domain_helper.is_wanted(domain):
1747 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1749 elif domain in domains:
1750 logger.debug("domain='%s' is already added - SKIPPED!", domain)
1752 elif instances.is_registered(domain):
1753 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1755 elif instances.is_recent(domain):
1756 logger.debug("domain='%s' has recently been crawled - SKIPPED!", domain)
1759 logger.info("Fetching instances from domain='%s'", domain)
1760 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1762 logger.debug("Success! - EXIT!")
1765 def fetch_relays(args: argparse.Namespace) -> int:
1766 logger.debug("args[]='%s' - CALLED!", type(args))
1768 logger.debug("Invoking locking.acquire() ...")
1771 if args.domain is not None and args.domain != "":
1772 logger.debug("Fetching instances record for args.domain='%s' ...", args.domain)
1773 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND domain = ? LIMIT 1", [args.domain])
1774 elif args.software is not None and args.software != "":
1775 logger.debug("Fetching instances records for args.software='%s' ...", args.software)
1776 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL AND software = ? ORDER BY last_updated DESC", [args.software])
1778 logger.debug("Fetch all relay instances ...")
1779 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay', 'pub-relay') AND nodeinfo_url IS NOT NULL ORDER BY last_updated DESC")
1782 rows = database.cursor.fetchall()
1784 logger.info("Checking %d relays ...", len(rows))
1786 logger.debug("row[domain]='%s',row[software]='%s'", row["domain"], row["software"])
1787 if not args.force and instances.is_recent(row["domain"]):
1788 logger.debug("row[domain]='%s' has recently been fetched - SKIPPED!", row["domain"])
1790 elif row["nodeinfo_url"] is None:
1791 logger.warning("row[domain]='%s' has empty nodeinfo_url but this is required - SKIPPED!", row["domain"])
1796 logger.debug("row[domain]='%s',row[software]='%s' - checking ....", row["domain"], row["software"])
1797 if row["software"] == "pub-relay":
1798 logger.info("Fetching row[nodeinfo_url]='%s' from relay row[domain]='%s',row[software]='%s' ...", row["nodeinfo_url"], row["domain"], row["software"])
1799 raw = network.fetch_api_url(
1800 row["nodeinfo_url"],
1801 (config.get("connection_timeout"), config.get("read_timeout"))
1804 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1805 if "exception" in raw:
1806 logger.warning("row[domain]='%s' has caused an exception: '%s' - raising again ...", row["domain"], type(raw["exception"]))
1807 raise raw["exception"]
1808 elif "error_message" in raw:
1809 logger.warning("row[domain]='%s' has caused error message: '%s' - SKIPPED!", row["domain"], raw["error_message"])
1810 instances.set_last_error(row["domain"], raw)
1811 instances.set_last_instance_fetch(row["domain"])
1812 instances.update(row["domain"])
1814 elif "json" not in raw:
1815 logger.warning("raw()=%d does not contain key 'json' in response - SKIPPED!", len(raw))
1817 elif not "metadata" in raw["json"]:
1818 logger.warning("raw[json]()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]))
1820 elif not "peers" in raw["json"]["metadata"]:
1821 logger.warning("raw[json][metadata()=%d does not contain key 'json' in response - SKIPPED!", len(raw["json"]["metadata"]))
1824 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1825 raw = network.fetch_url(
1826 f"https://{row['domain']}",
1827 network.web_headers,
1828 (config.get("connection_timeout"), config.get("read_timeout"))
1830 logger.debug("raw[%s]()=%d", type(raw), len(raw))
1832 doc = bs4.BeautifulSoup(raw, features="html.parser")
1833 logger.debug("doc[]='%s'", type(doc))
1835 except network.exceptions as exception:
1836 logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1837 instances.set_last_error(row["domain"], exception)
1838 instances.set_last_instance_fetch(row["domain"])
1839 instances.update(row["domain"])
1842 logger.debug("row[software]='%s'", row["software"])
1843 if row["software"] == "activityrelay":
1844 logger.debug("Checking row[domain]='%s' ...", row["domain"])
1845 tags = doc.findAll("p")
1847 logger.debug("Checking %d paragraphs ...", len(tags))
1849 logger.debug("tag[]='%s'", type(tag))
1850 if len(tag.contents) == 0:
1851 logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1853 elif "registered instances" not in tag.contents[0]:
1854 logger.debug("Skipping paragraph, text not found.")
1857 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1858 for domain in tag.contents:
1859 logger.debug("domain[%s]='%s'", type(domain), domain)
1860 if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1863 domain = str(domain)
1864 logger.debug("domain='%s'", domain)
1865 if not domain_helper.is_wanted(domain):
1866 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1869 logger.debug("domain='%s' - BEFORE!", domain)
domain = tidyup.domain(domain) if domain not in [None, ""] else None
1871 logger.debug("domain='%s' - AFTER!", domain)
1873 if domain in [None, ""]:
1874 logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1876 elif domain not in peers:
1877 logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1878 peers.append(domain)
1880 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1881 if dict_helper.has_key(domains, "domain", domain):
1882 logger.debug("domain='%s' already added", domain)
1885 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1888 "origin": row["domain"],
1890 elif row["software"] in ["aoderelay", "selective-relay"]:
1891 logger.debug("Checking row[domain]='%s' ...", row["domain"])
1892 if row["software"] == "aoderelay":
1893 tags = doc.findAll("section", {"class": "instance"})
1895 tags = doc.find("div", {"id": "instances"}).findAll("li")
1897 logger.debug("Checking %d tags ...", len(tags))
1899 logger.debug("tag[]='%s'", type(tag))
1901 link = tag.find("a")
1902 logger.debug("link[%s]='%s'", type(link), link)
1903 if not isinstance(link, bs4.element.Tag):
1904 logger.warning("tag[%s]='%s' is not type of 'bs4.element.Tag' - SKIPPED!", type(tag), tag)
1907 components = urlparse(link.get("href"))
1908 logger.debug("components(%d)='%s'", len(components), components)
1909 domain = components.netloc.lower().split(":")[0]
1911 logger.debug("domain='%s' - BEFORE!", domain)
domain = tidyup.domain(domain) if domain not in [None, ""] else None
1913 logger.debug("domain='%s' - AFTER!", domain)
1915 if domain in [None, ""]:
1916 logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1918 elif domain not in peers:
1919 logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1920 peers.append(domain)
1922 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1923 if dict_helper.has_key(domains, "domain", domain):
1924 logger.debug("domain='%s' already added", domain)
1927 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1930 "origin": row["domain"],
1932 elif row["software"] == "pub-relay":
1933 logger.debug("Checking %d peer(s) row[domain]='%s' ...", len(raw["json"]["metadata"]["peers"]), row["domain"])
1934 for domain in raw["json"]["metadata"]["peers"]:
1935 logger.debug("domain='%s' - BEFORE!", domain)
domain = tidyup.domain(domain) if domain not in [None, ""] else None
1937 logger.debug("domain='%s' - AFTER!", domain)
1939 if domain in [None, ""]:
1940 logger.debug("domain[%s]='%s' is empty after tidyup.domain() from origin='%s' - SKIPPED!", type(domain), domain, row["domain"])
1942 elif domain not in peers:
1943 logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1944 peers.append(domain)
1946 logger.debug("domains()=%d,domain='%s'", len(domains), domain)
1947 if dict_helper.has_key(domains, "domain", domain):
1948 logger.debug("domain='%s' already added", domain)
1951 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1954 "origin": row["domain"],
1957 logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
1960 logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1961 instances.set_last_instance_fetch(row["domain"])
1963 logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1964 instances.set_total_peers(row["domain"], peers)
1966 logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1967 instances.update(row["domain"])
1969 logger.info("Checking %d domains ...", len(domains))
1971 logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
1972 if not domain_helper.is_wanted(row["domain"]):
1973 logger.debug("row[domain]='%s' is not wanted - SKIPPED!", row["domain"])
1975 elif instances.is_registered(row["domain"]):
1976 logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
1978 elif instances.is_recent(row["domain"]):
1979 logger.debug("row[domain]='%s' has recently been crawled - SKIPPED!", row["domain"])
1982 logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
1983 federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
1985 logger.debug("Success! - EXIT!")
1988 def convert_idna(args: argparse.Namespace) -> int:
1989 logger.debug("args[]='%s' - CALLED!", type(args))
1991 database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
1992 rows = database.cursor.fetchall()
1994 logger.debug("rows[]='%s'", type(rows))
1995 instances.translate_idnas(rows, "domain")
1997 database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
1998 rows = database.cursor.fetchall()
2000 logger.debug("rows[]='%s'", type(rows))
2001 instances.translate_idnas(rows, "origin")
2003 database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2004 rows = database.cursor.fetchall()
2006 logger.debug("rows[]='%s'", type(rows))
2007 blocks.translate_idnas(rows, "blocker")
2009 database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2010 rows = database.cursor.fetchall()
2012 logger.debug("rows[]='%s'", type(rows))
2013 blocks.translate_idnas(rows, "blocked")
2015 logger.debug("Success! - EXIT!")
2018 def remove_invalid(args: argparse.Namespace) -> int:
2019 logger.debug("args[]='%s' - CALLED!", type(args))
2021 logger.debug("Invoking locking.acquire() ...")
2024 database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2025 rows = database.cursor.fetchall()
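# A row is invalid when the host part of its domain does not validate as a DNS
# name; such rows are purged from both the blocks and instances tables.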
2027 logger.info("Checking %d domains ...", len(rows))
2029 logger.debug("row[domain]='%s'", row["domain"])
2030 if not validators.domain(row["domain"].split("/")[0]):
2031 logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
2032 database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2033 database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2035 logger.debug("Invoking commit() ...")
2036 database.connection.commit()
2038 logger.info("Vaccum cleaning database ...")
2039 database.cursor.execute("VACUUM")
2041 logger.debug("Success! - EXIT!")