1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
23 from urllib.parse import urlparse
32 from fba import database
35 from fba.helpers import blacklist
36 from fba.helpers import blocklists
37 from fba.helpers import config
38 from fba.helpers import cookies
39 from fba.helpers import dicts as dict_helper
40 from fba.helpers import domain as domain_helper
41 from fba.helpers import locking
42 from fba.helpers import processing
43 from fba.helpers import software as software_helper
44 from fba.helpers import tidyup
46 from fba.http import csrf
47 from fba.http import federation
48 from fba.http import network
50 from fba.models import blocks
51 from fba.models import instances
52 from fba.models import sources
54 from fba.networks import friendica
55 from fba.networks import lemmy
56 from fba.networks import mastodon
57 from fba.networks import misskey
58 from fba.networks import pleroma
# Module-wide logger setup: INFO by default for the whole process.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Uncomment for verbose per-call tracing during development:
#logger.setLevel(logging.DEBUG)
def check_instance(args: argparse.Namespace) -> int:
    """Check a single domain (args.domain) prior to registration.

    The domain must be syntactically valid, not blacklisted and not already
    registered; each failed precondition is logged as a warning. Returns a
    numeric status code.
    """
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    # Precondition checks, cheapest first.
    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)

    logger.info("args.domain='%s' is not known", args.domain)

    # NOTE(review): 'status' is assigned in the branches above — confirm.
    logger.debug("status=%d - EXIT!", status)
def check_nodeinfo(args: argparse.Namespace) -> int:
    """Consistency check over all registered instances with a nodeinfo URL.

    For each instance, verify that the stored nodeinfo URL actually refers to
    the instance's own domain (unicode or punycode form); mismatches are
    logged as warnings.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    # All instances that have a nodeinfo URL stored.
    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        # IDNA-encode the domain so URLs containing punycode also match.
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            # A relative URL is always on the instance's own host.
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            # Neither punycode nor unicode form of the domain appears in the URL.
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])

    # NOTE(review): 'cnt' is maintained elsewhere in this function — confirm.
    logger.info("Found %d row(s)", cnt)

    logger.debug("EXIT!")
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    """Fetch the public server list from pixelfed.org's API and queue any
    new, wanted domains for instance fetching.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers by yourself here

    source_domain = "pixelfed.org"
    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source API was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

        # Determine CSRF-related request headers before hitting the API.
        logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
        headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)

    logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
    fetched = network.get_json_api(
        "/api/v1/servers/all.json?scope=All&country=all&language=all",
        (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("JSON API returned %d elements", len(fetched))
    if "error_message" in fetched:
        logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
    elif "data" not in fetched["json"]:
        logger.warning("API did not return JSON with 'data' element - EXIT!")

    rows = fetched["json"]["data"]
    logger.info("Checking %d fetched rows ...", len(rows))
        logger.debug("row[]='%s'", type(row))
        if "domain" not in row:
            logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
        elif row["domain"] == "":
            logger.debug("row[domain] is empty - SKIPPED!")

        logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
        # Normalize to punycode for consistent storage/lookup.
        domain = row["domain"].encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not domain_helper.is_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
        elif instances.is_registered(domain):
            logger.debug("domain='%s' is already registered - SKIPPED!", domain)
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

        logger.debug("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))

    logger.debug("Success! - EXIT!")
def fetch_bkali(args: argparse.Namespace) -> int:
    """Fetch a domain list from the gql.api.bka.li GraphQL API and register
    any new, wanted domains as instances.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source API was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

        logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
        fetched = network.post_json_api(
            # GraphQL query: all nodeinfo records ordered by domain.
                "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"

        logger.debug("fetched[]='%s'", type(fetched))
        if "error_message" in fetched:
            logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
        elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
            logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])

        rows = fetched["json"]

        logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
            # Defensive structure checks on the GraphQL response.
            raise Exception("WARNING: Returned no records")
        elif "data" not in rows:
            raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
        elif "nodeinfo" not in rows["data"]:
            raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

        for entry in rows["data"]["nodeinfo"]:
            logger.debug("entry[%s]='%s'", type(entry), entry)
            if "domain" not in entry:
                logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
            elif entry["domain"] == "":
                logger.debug("entry[domain] is empty - SKIPPED!")
            elif not domain_helper.is_wanted(entry["domain"]):
                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
            elif instances.is_registered(entry["domain"]):
                logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
            elif instances.is_recent(entry["domain"]):
                logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])

            logger.debug("Adding domain='%s' ...", entry["domain"])
            # NOTE(review): 'domains' is initialised earlier in this function — confirm.
            domains.append(entry["domain"])

    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s' - BEFORE!", domain)
        # Normalize to punycode before fetching.
        domain = domain.encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

            logger.info("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
def fetch_blocks(args: argparse.Namespace) -> int:
    """Fetch block lists from registered instances and store the blocks.

    Selection is driven by args: a single domain (args.domain), a single
    software type (args.software), or all supported software types. Per
    blocker, the generic federation API is tried first and a software-specific
    scraper (pleroma/mastodon/lemmy/friendica/misskey) is used as fallback.
    Obfuscated entries ('*'/'?') are deobfuscated where possible. Optionally
    posts a bot summary when config 'bot_enabled' is set.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)

    logger.debug("Invoking locking.acquire() ...")

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for args.domain='%s' ...", args.domain)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ? LIMIT 1", [args.domain]
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [args.software]
        logger.debug("Re-checking all instances ...")
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC"
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY total_blocks DESC, last_updated ASC", [time.time() - config.get("recheck_block")]

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)

        if not domain_helper.is_wanted(blocker):
            logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)

        # Reset per-blocker state before re-fetching its block list.
        logger.debug("Setting last_blocked,has_obfuscation=false for blocker='%s' ...", blocker)
        instances.set_last_blocked(blocker)
        instances.set_has_obfuscation(blocker, False)

        # c.s isn't part of oliphant's "hidden" blocklists
        if blocker == "chaos.social" or blocklists.has(blocker):
            logger.debug("Skipping blocker='%s', run ./fba.py fetch_cs or fetch_oliphant instead!", blocker)

        logger.debug("Invoking federation.fetch_blocks(%s) ...", blocker)
        blocking = federation.fetch_blocks(blocker)

        logger.info("blocker='%s',software='%s' has %d block entries returned.", blocker, software, len(blocking))
        if len(blocking) == 0:
            # Generic fetch returned nothing; fall back to per-software scraper.
            logger.debug("blocker='%s',software='%s' - fetching blocklist ...", blocker, software)
            if software == "pleroma":
                blocking = pleroma.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "mastodon":
                blocking = mastodon.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "lemmy":
                blocking = lemmy.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "friendica":
                blocking = friendica.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
            elif software == "misskey":
                blocking = misskey.fetch_blocks(blocker)
                logger.debug("blocker='%s' returned %d entries,software='%s'", blocker, len(blocking), software)
                logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
        instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])

            # Tidy up domain and (optional) reason text.
            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"] = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)

                # Deobfuscation succeeded; use the resolved record.
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                    logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                    instances.set_has_obfuscation(blocker, True)

                # Deobfuscation succeeded; use the resolved record.
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]

            logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")

            # Normalize: strip leading dots and IDNA-encode.
            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not domain_helper.is_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                    "blocked": block["blocked"],
                    "reason" : block["reason"],

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        # NOTE(review): 'blockdict' is initialised earlier in this function — confirm.
        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
def fetch_observer(args: argparse.Namespace) -> int:
    """Fetch instance tables from fediverse.observer, per software type.

    When args.software is not given, the list of software types is scraped
    from the site's navigation bar; otherwise only args.software is fetched.
    New, wanted domains are registered via federation.fetch_instances().
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    if args.software is None:
        # Scrape the list of software types from the site's navigation bar.
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("raw[%s]()=%d", type(raw), len(raw))
        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
            logger.warning("Cannot find navigation bar, cannot continue!")

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
            logger.debug("item[%s]='%s'", type(item), item)
            if item.text.lower() == "all":
                logger.debug("Skipping 'All' menu entry ...")

            logger.debug("Appending item.text='%s' ...", item.text)
            # NOTE(review): 'types' is initialised earlier in this function — confirm.
            types.append(tidyup.domain(item.text))
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)

            logger.debug("Fetching table data for software='%s' ...", software)
            raw = utils.fetch_url(
                f"https://{source_domain}/app/views/tabledata.php?software={software}",
                (config.get("connection_timeout"), config.get("read_timeout"))

            logger.debug("raw[%s]()=%d", type(raw), len(raw))
            doc = bs4.BeautifulSoup(raw, features="html.parser")
            logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
            logger.debug("item[]='%s'", type(item))
            domain = item.decode_contents()
            logger.debug("domain='%s' - AFTER!", domain)

                logger.debug("domain is empty - SKIPPED!")

            # Normalize to punycode for consistent storage/lookup.
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)

            # Map the scraped software name to its canonical alias.
            software = software_helper.alias(software)
            logger.info("Fetching instances for domain='%s'", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    """Fetch silenced/suspended server lists from wiki.todon.eu and store
    them as blocks ('silenced' and 'reject' levels).
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/todon/domainblocks",
        (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    # NOTE(review): local 'blocklist' mapping and 'blocker' are initialised
    # earlier in this function — confirm.
    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                    logger.info("Fetching instances from domain='%s' ...", blocked)
                    federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
def fetch_cs(args: argparse.Namespace):
    """Fetch chaos.social's published federation.md (from GitHub raw), parse
    its 'silenced' and 'blocked' tables and store them as blocks for the
    blocker 'chaos.social'.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
    raw = utils.fetch_url(
        f"https://{source_domain}/chaossocial/meta/master/federation.md",
        (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    # Render the markdown to HTML, then parse the resulting tables.
    # NOTE(review): 'extensions' and local 'blocklist' are initialised earlier
    # in this function — confirm.
    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_last_blocked(blocker)
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if not "domain" in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                elif not instances.is_registered(row["domain"]):
                        logger.info("Fetching instances from domain='%s' ...", row["domain"])
                        federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                        "blocked": row["domain"],
                        "reason" : row["reason"],

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update(blocker)

    logger.debug("Success! - EXIT!")
def fetch_fba_rss(args: argparse.Namespace) -> int:
    """Fetch an FBA-specific RSS feed (args.feed), extract domains from the
    item links and register new, wanted domains as instances.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        # Rate-limit: skip when the feed's host was queried recently.
        logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc)

    logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
    sources.update(components.netloc)

    logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            # The domain is carried in the link's query value (after '=').
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
                logger.debug("domain is empty - SKIPPED!")

            # Normalize to punycode for consistent storage/lookup.
            logger.debug("domain='%s' - BEFORE!", domain)
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not domain_helper.is_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

            logger.debug("Adding domain='%s'", domain)
            # NOTE(review): 'domains' is initialised earlier in this function — confirm.
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s'", domain)

            logger.info("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    """Fetch the FBA bot account's ATOM feed (default: ryona.agency, or
    args.feed when given), extract linked domains from each entry's HTML
    content and register new, wanted domains as instances.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "ryona.agency"
    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.debug("args.feed[%s]='%s'", type(args.feed), args.feed)
    if args.feed is not None and validators.url(args.feed):
        # A custom feed URL overrides the default source.
        logger.debug("Setting feed='%s' ...", args.feed)
        feed = str(args.feed)
        source_domain = urlparse(args.feed).netloc

    if sources.is_recent(source_domain):
        # Rate-limit: skip when the source was queried recently.
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)

    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code == 200 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            # Entry content is HTML; collect domains from all anchor hrefs.
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                        logger.debug("domain is empty - SKIPPED!")

                    # Normalize to punycode for consistent storage/lookup.
                    logger.debug("domain='%s' - BEFORE!", domain)
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not domain_helper.is_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    # NOTE(review): 'domains' is initialised earlier in this function — confirm.
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s'", domain)

            logger.info("Fetching instances from domain='%s' ...", domain)
            federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
912 def fetch_instances(args: argparse.Namespace) -> int:
# Fetch the peer list of a single instance given via --domain, then — unless
# restricted (the elided branch around embedded line 948 suggests an early
# exit flag) — re-crawl known instances of supported software whose
# last_instance_fetch is stale.
# NOTE(review): elided view (gaps in embedded line numbers); `try:`,
# `continue`, `return` lines are missing here — confirm against full source.
913 logger.debug("args[]='%s' - CALLED!", type(args))
915 logger.debug("args.domain='%s' - checking ...", args.domain)
914 
916 if not validators.domain(args.domain):
917 logger.warning("args.domain='%s' is not valid.", args.domain)
919 elif blacklist.is_blacklisted(args.domain):
920 logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
923 logger.debug("Invoking locking.acquire() ...")
927 domain = tidyup.domain(args.domain)
928 origin = software = None
# Reuse any previously detected origin/software for the seed domain.
931 database.cursor.execute("SELECT origin, software FROM instances WHERE domain = ? LIMIT 1", [args.domain])
932 row = database.cursor.fetchone()
934 origin = row["origin"]
935 software = row["software"]
939 logger.info("Fetching instances from args.domain='%s',origin='%s',software='%s' ...", domain, origin, software)
940 federation.fetch_instances(domain, origin, software, inspect.currentframe().f_code.co_name)
941 except network.exceptions as exception:
942 logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
943 instances.set_last_error(args.domain, exception)
944 instances.update(args.domain)
948 logger.debug("Not fetching more instances - EXIT!")
951 # Loop through some instances
952 database.cursor.execute(
# Only instances of these known-software types are re-crawled; ordering
# prefers well-connected (total_peers DESC) and least-recently-updated rows.
953 "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey', 'mammuthus', 'neodb') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY total_peers DESC, last_updated ASC", [time.time() - config.get("recheck_instance")]
956 rows = database.cursor.fetchall()
957 logger.info("Checking %d entries ...", len(rows))
959 logger.debug("row[domain]='%s'", row["domain"])
960 if row["domain"] == "":
961 logger.debug("row[domain] is empty - SKIPPED!")
964 logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
# Punycode-normalize the stored domain before the wanted-check.
965 domain = row["domain"].encode("idna").decode("utf-8")
966 logger.debug("domain='%s' - AFTER!", domain)
968 if not domain_helper.is_wanted(domain):
969 logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain)
973 logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
974 federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
975 except network.exceptions as exception:
976 logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
977 instances.set_last_error(domain, exception)
979 logger.debug("Success - EXIT!")
982 def fetch_oliphant(args: argparse.Namespace) -> int:
# Download oliphant's blocklist CSV files from codeberg.org and import each
# row as a block record (severity, reject_media, reject_reports) for the
# respective blocker instance.
# NOTE(review): elided view — `try:`/`continue`/`return` and some
# assignments (e.g. the `reject_media = True` body at embedded line 1050)
# are missing; confirm against the full source before editing.
983 logger.debug("args[]='%s' - CALLED!", type(args))
985 logger.debug("Invoking locking.acquire() ...")
988 source_domain = "codeberg.org"
989 if sources.is_recent(source_domain):
990 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
993 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
994 sources.update(source_domain)
997 base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
1001 logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
1002 for block in blocklists.oliphant_blocklists:
1003 # Is domain given and not equal blocker?
1004 if isinstance(args.domain, str) and args.domain != block["blocker"]:
1005 logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1007 elif args.domain in domains:
1008 logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
1011 instances.set_last_blocked(block["blocker"])
1014 logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
1015 response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1017 logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
1018 if not response.ok or response.status_code > 200 or response.content == "":
1019 logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
1022 logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
1023 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1029 logger.debug("row[%s]='%s'", type(row), row)
1030 domain = severity = None
1031 reject_media = reject_reports = False
# The CSV column headers vary between lists: some prefix with '#'.
1033 if "#domain" in row:
1034 domain = row["#domain"]
1035 elif "domain" in row:
1036 domain = row["domain"]
1038 logger.debug("row='%s' does not contain domain column", row)
1041 if "#severity" in row:
1042 severity = blocks.alias_block_level(row["#severity"])
1043 elif "severity" in row:
1044 severity = blocks.alias_block_level(row["severity"])
1046 logger.debug("row='%s' does not contain severity column", row)
1049 if "#reject_media" in row and row["#reject_media"].lower() == "true":
1051 elif "reject_media" in row and row["reject_media"].lower() == "true":
1054 if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1055 reject_reports = True
1056 elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1057 reject_reports = True
1060 logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
1062 logger.debug("domain is empty - SKIPPED!")
# Filter out non-routable / placeholder domain classes.
1064 elif domain.endswith(".onion"):
1065 logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
1067 elif domain.endswith(".arpa"):
1068 logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
1070 elif domain.endswith(".tld"):
1071 logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
# Wildcard entries are deobfuscated by matching against known instances.
1073 elif domain.find("*") >= 0 or domain.find("?") >= 0:
1074 logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
1075 domain = utils.deobfuscate(domain, block["blocker"])
1076 logger.debug("domain='%s' - AFTER!", domain)
1078 if not validators.domain(domain):
# NOTE(review): missing logging argument — '%s' has no matching `domain`
# parameter in the call below.
1079 logger.debug("domain='%s' is not a valid domain - SKIPPED!")
1081 elif blacklist.is_blacklisted(domain):
1082 logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
1084 elif blocks.is_instance_blocked(block["blocker"], domain, severity):
1085 logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
1088 logger.debug("Marking domain='%s' as handled", domain)
1089 domains.append(domain)
1091 logger.debug("Processing domain='%s' ...", domain)
1092 processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1093 logger.debug("processed='%s'", processed)
1095 if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
1096 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1099 "reason" : block["reason"],
# Record the extra block flavors as separate block levels.
1103 processing.block(block["blocker"], domain, None, "reject_media")
1105 processing.block(block["blocker"], domain, None, "reject_reports")
1107 logger.debug("block[blocker]='%s'", block["blocker"])
# Only update totals for blockers not managed through blocklists helper.
1108 if not blocklists.has(block["blocker"]):
1109 logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
1110 instances.set_total_blocks(block["blocker"], domains)
1112 logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1113 if instances.has_pending(block["blocker"]):
1114 logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1115 instances.update(block["blocker"])
1117 logger.debug("Invoking commit() ...")
1118 database.connection.commit()
1120 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1121 if config.get("bot_enabled") and len(blockdict) > 0:
1122 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1123 network.send_bot_post(block["blocker"], blockdict)
1125 logger.debug("Success! - EXIT!")
1128 def fetch_txt(args: argparse.Namespace) -> int:
# Fetch plain-text blocklists (one domain per line), currently only
# seirdy.one's bsl.txt, and process each listed domain for that blocker.
# NOTE(review): elided view — surrounding list/loop syntax (the `urls`
# literal around embedded lines 1135-1139) is partially missing; confirm
# against the full source before editing.
1129 logger.debug("args[]='%s' - CALLED!", type(args))
1131 logger.debug("Invoking locking.acquire() ...")
1136 "blocker": "seirdy.one",
1137 "url" : "https://seirdy.one/pb/bsl.txt",
1140 logger.info("Checking %d text file(s) ...", len(urls))
1142 logger.debug("Fetching row[url]='%s' ...", row["url"])
1143 response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1145 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1146 if response.ok and response.status_code == 200 and response.text != "":
1147 logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
# One domain per line in the fetched text file.
1148 domains = response.text.split("\n")
1150 logger.info("Processing %d domains ...", len(domains))
1151 for domain in domains:
1152 logger.debug("domain='%s' - BEFORE!", domain)
1153 domain = tidyup.domain(domain)
1155 logger.debug("domain='%s' - AFTER!", domain)
1157 logger.debug("domain is empty - SKIPPED!")
1159 elif not domain_helper.is_wanted(domain):
1160 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1162 elif instances.is_recent(domain):
1163 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1166 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1167 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1169 logger.debug("processed='%s'", processed)
1171 logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1174 logger.debug("Success! - EXIT!")
1177 def fetch_fedipact(args: argparse.Namespace) -> int:
# Scrape the fedipact.online landing page and register every instance
# listed in its <li> elements, seeding them with origin "beach.city".
# NOTE(review): elided view — gaps in embedded line numbers hide
# `continue`/`return` lines; confirm against the full source before editing.
1178 logger.debug("args[]='%s' - CALLED!", type(args))
1180 logger.debug("Invoking locking.acquire() ...")
1183 source_domain = "fedipact.online"
1184 if sources.is_recent(source_domain):
1185 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1188 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1189 sources.update(source_domain)
1191 logger.info("Fetching / from source_domain='%s' ...", source_domain)
1192 response = utils.fetch_url(
1193 f"https://{source_domain}",
1194 network.web_headers,
1195 (config.get("connection_timeout"), config.get("read_timeout"))
1198 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1199 if response.ok and response.status_code == 200 and response.text != "":
1200 logger.debug("Parsing %d Bytes ...", len(response.text))
1202 doc = bs4.BeautifulSoup(response.text, "html.parser")
1203 logger.debug("doc[]='%s'", type(doc))
# Each signatory instance is a list item on the page.
1205 rows = doc.findAll("li")
1206 logger.info("Checking %d row(s) ...", len(rows))
1208 logger.debug("row[]='%s'", type(row))
1209 domain = tidyup.domain(row.contents[0])
1211 logger.debug("domain='%s' - AFTER!", domain)
1213 logger.debug("domain is empty - SKIPPED!")
1216 logger.debug("domain='%s' - BEFORE!", domain)
# Punycode-normalize before the wanted/registered/recent checks.
1217 domain = domain.encode("idna").decode("utf-8")
1218 logger.debug("domain='%s' - AFTER!", domain)
1220 if not domain_helper.is_wanted(domain):
1221 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1223 elif instances.is_registered(domain):
1224 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1226 elif instances.is_recent(domain):
1227 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1230 logger.info("Fetching domain='%s' ...", domain)
1231 federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1233 logger.debug("Success! - EXIT!")
1236 def fetch_joinmobilizon(args: argparse.Namespace) -> int:
# Fetch the Mobilizon instance directory (instances.joinmobilizon.org API)
# and register each listed host, seeding with origin "demo.mobilizon.org".
# NOTE(review): elided view — `continue`/`return` lines are hidden by gaps
# in the embedded line numbers; confirm against the full source.
1237 logger.debug("args[]='%s' - CALLED!", type(args))
1239 logger.debug("Invoking locking.acquire() ...")
1242 source_domain = "instances.joinmobilizon.org"
1243 if sources.is_recent(source_domain):
1244 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1247 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1248 sources.update(source_domain)
1250 logger.info("Fetching instances from source_domain='%s' ...", source_domain)
1251 raw = utils.fetch_url(
1252 f"https://{source_domain}/api/v1/instances",
1253 network.web_headers,
1254 (config.get("connection_timeout"), config.get("read_timeout"))
1256 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1258 parsed = json.loads(raw)
1259 logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1261 if "data" not in parsed:
# NOTE(review): missing logging argument — '%d' has no matching value in
# the warning call below.
1262 logger.warning("parsed()=%d does not contain key 'data'")
1265 logger.info("Checking %d instances ...", len(parsed["data"]))
1266 for row in parsed["data"]:
1267 logger.debug("row[]='%s'", type(row))
1268 if "host" not in row:
1269 logger.warning("row='%s' does not contain key 'host' - SKIPPED!", row)
1271 elif not domain_helper.is_wanted(row["host"]):
1272 logger.debug("row[host]='%s' is not wanted - SKIPPED!", row["host"])
1274 elif instances.is_registered(row["host"]):
1275 logger.debug("row[host]='%s' is already registered - SKIPPED!", row["host"])
1278 logger.info("Fetching row[host]='%s' ...", row["host"])
1279 federation.fetch_instances(row["host"], "demo.mobilizon.org", None, inspect.currentframe().f_code.co_name)
1281 logger.debug("Success! - EXIT!")
1284 def fetch_joinmisskey(args: argparse.Namespace) -> int:
# Fetch the Misskey instance directory (instanceapp.misskey.page
# instances.json) and register each listed URL, seeding with "misskey.io".
# NOTE(review): elided view — `continue`/`return` lines are hidden by gaps
# in the embedded line numbers; confirm against the full source.
1285 logger.debug("args[]='%s' - CALLED!", type(args))
1287 logger.debug("Invoking locking.acquire() ...")
1290 source_domain = "instanceapp.misskey.page"
1291 if sources.is_recent(source_domain):
1292 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1295 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1296 sources.update(source_domain)
1298 logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1299 raw = utils.fetch_url(
1300 f"https://{source_domain}/instances.json",
1301 network.web_headers,
1302 (config.get("connection_timeout"), config.get("read_timeout"))
1304 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1306 parsed = json.loads(raw)
1307 logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1309 if "instancesInfos" not in parsed:
# NOTE(review): missing logging argument — '%d' has no matching value in
# the warning call below.
1310 logger.warning("parsed()=%d does not contain element 'instancesInfos'")
# NOTE(review): typo "instane(s)" in the log message below (runtime string,
# left unchanged here).
1313 logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1314 for row in parsed["instancesInfos"]:
1315 logger.debug("row[%s]='%s'", type(row), row)
1316 if "url" not in row:
1317 logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1319 elif not domain_helper.is_wanted(row["url"]):
1320 logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1322 elif instances.is_registered(row["url"]):
1323 logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
1326 logger.info("Fetching row[url]='%s' ...", row["url"])
1327 federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1329 logger.debug("Success! - EXIT!")
1332 def fetch_joinfediverse(args: argparse.Namespace) -> int:
# Scrape the joinfediverse.wiki /FediBlock wiki page: parse its wikitable(s)
# into block records (blocked domain, reason, subdomain list), expand
# subdomains, then register blocks attributed to the climatejustice.*
# instances found in the local database.
# NOTE(review): elided view — initializations of `blocklist`, `blocking`,
# `block`, `blockdict`, `cnt` and many `continue` lines are hidden by gaps
# in the embedded line numbers; confirm against the full source.
1333 logger.debug("args[]='%s' - CALLED!", type(args))
1335 logger.debug("Invoking locking.acquire() ...")
1338 source_domain = "joinfediverse.wiki"
1339 if sources.is_recent(source_domain):
1340 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1343 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1344 sources.update(source_domain)
1346 logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1347 raw = utils.fetch_url(
1348 f"https://{source_domain}/FediBlock",
1349 network.web_headers,
1350 (config.get("connection_timeout"), config.get("read_timeout"))
1352 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1354 doc = bs4.BeautifulSoup(raw, "html.parser")
1355 logger.debug("doc[]='%s'", type(doc))
1357 tables = doc.findAll("table", {"class": "wikitable"})
1359 logger.info("Analyzing %d table(s) ...", len(tables))
1361 for table in tables:
1362 logger.debug("table[]='%s'", type(table))
1364 rows = table.findAll("tr")
1365 logger.info("Checking %d row(s) ...", len(rows))
1366 block_headers = dict()
1368 logger.debug("row[%s]='%s'", type(row), row)
1370 headers = row.findAll("th")
1371 logger.debug("Found headers()=%d header(s)", len(headers))
# A header row (>1 <th>) defines the column layout for subsequent rows;
# block_headers maps column index -> normalized header text.
1372 if len(headers) > 1:
1373 block_headers = dict()
1375 for header in headers:
1377 logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1378 text = header.contents[0]
1380 logger.debug("text[]='%s'", type(text))
1381 if not isinstance(text, str):
1382 logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1384 elif validators.domain(text.strip()):
1385 logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1388 text = tidyup.domain(text.strip())
1389 logger.debug("text='%s' - AFTER!", text)
# Only these column titles are scraped from the wikitable.
1390 if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1391 logger.debug("Found header: '%s'=%d", text, cnt)
1392 block_headers[cnt] = text
1394 elif len(block_headers) == 0:
1395 logger.debug("row is not scrapable - SKIPPED!")
1397 elif len(block_headers) > 0:
1398 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1402 for element in row.find_all(["th", "td"]):
1404 logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1405 if cnt in block_headers:
1406 logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1408 text = element.text.strip()
# "domain"/"instance" columns are unified under the key "blocked".
1409 key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1411 logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1412 if key in ["domain", "instance"]:
1414 elif key == "reason":
1415 block[key] = tidyup.reason(text)
1416 elif key == "subdomain(s)":
# Subdomain cells list multiple entries separated by "/".
1419 block[key] = text.split("/")
1421 logger.debug("key='%s'", key)
1424 logger.debug("block()=%d ...", len(block))
1426 logger.debug("Appending block()=%d ...", len(block))
1427 blocklist.append(block)
1429 logger.debug("blocklist()=%d", len(blocklist))
# Blockers are the locally known climatejustice.* instances.
1431 database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1432 domains = database.cursor.fetchall()
1434 logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1436 for block in blocklist:
1437 logger.debug("block='%s'", block)
# Expand "subdomain(s)" entries into one block record per sub.domain.
1438 if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1439 origin = block["blocked"]
1440 logger.debug("origin='%s'", origin)
1441 for subdomain in block["subdomain(s)"]:
1442 block["blocked"] = subdomain + "." + origin
1443 logger.debug("block[blocked]='%s'", block["blocked"])
1444 blocking.append(block)
1446 blocking.append(block)
# NOTE(review): '%d' placeholder but `blocking` (a list) is passed below.
1448 logger.debug("blocking()=%d", blocking)
1449 for block in blocking:
1450 logger.debug("block[]='%s'", type(block))
1451 if "blocked" not in block:
1452 raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1454 block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1455 logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1457 if block["blocked"] == "":
1458 logger.debug("block[blocked] is empty - SKIPPED!")
1460 elif not domain_helper.is_wanted(block["blocked"]):
1461 logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1463 elif instances.is_recent(block["blocked"]):
1464 logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1467 logger.debug("Proccessing blocked='%s' ...", block["blocked"])
1468 processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1471 for blocker in domains:
# Each row is a 1-tuple (domain,) from the SELECT above.
1472 blocker = blocker[0]
1473 logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1474 instances.set_last_blocked(blocker)
1476 for block in blocking:
1477 logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1478 block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1480 logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1481 if block["blocked"] == "":
1482 logger.debug("block[blocked] is empty - SKIPPED!")
1484 elif not domain_helper.is_wanted(block["blocked"]):
1485 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1488 logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1489 if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1490 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
1492 "blocked": block["blocked"],
1493 "reason" : block["reason"],
1496 if instances.has_pending(blocker):
1497 logger.debug("Flushing updates for blocker='%s' ...", blocker)
1498 instances.update(blocker)
1500 logger.debug("Invoking commit() ...")
1501 database.connection.commit()
1503 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1504 if config.get("bot_enabled") and len(blockdict) > 0:
# NOTE(review): unbalanced quote in "blocker='%s," in the message below.
1505 logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", blocker, len(blockdict))
1506 network.send_bot_post(blocker, blockdict)
1508 logger.debug("Success! - EXIT!")
1511 def recheck_obfuscation(args: argparse.Namespace) -> int:
# Re-scan instances flagged has_obfuscation=1: refetch their block lists
# (via federation or a software-specific fetcher), deobfuscate wildcarded
# entries against known instances, and store any newly resolved blocks.
# NOTE(review): elided view — initializations of `blockdict`, `obfuscated`,
# `blocked` and `continue`/`return` lines are hidden by gaps in the embedded
# line numbers; confirm against the full source before editing.
1512 logger.debug("args[]='%s' - CALLED!", type(args))
1514 logger.debug("Invoking locking.acquire() ...")
# Narrow the candidate set by --domain or --software when given.
1517 if isinstance(args.domain, str) and args.domain != "" and domain_helper.is_wanted(args.domain):
1518 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
# NOTE(review): `validators.domain(args.software) == args.software` looks
# suspicious — validating a *software name* as a domain; confirm intent.
1519 elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1520 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1522 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1524 rows = database.cursor.fetchall()
1525 logger.info("Checking %d domains ...", len(rows))
1527 logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
# Skip recently-checked rows unless --force or an explicit filter was given.
1528 if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1529 logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
1532 logger.debug("Invoking federation.fetch_blocks(%s) ...", row["domain"])
1533 blocking = federation.fetch_blocks(row["domain"])
1535 logger.debug("blocking()=%d", len(blocking))
# Fall back to software-specific scrapers when the generic fetch is empty.
1536 if len(blocking) == 0:
1537 if row["software"] == "pleroma":
1538 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1539 blocking = pleroma.fetch_blocks(row["domain"])
1540 elif row["software"] == "mastodon":
1541 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1542 blocking = mastodon.fetch_blocks(row["domain"])
1543 elif row["software"] == "lemmy":
1544 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1545 blocking = lemmy.fetch_blocks(row["domain"])
1546 elif row["software"] == "friendica":
1547 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1548 blocking = friendica.fetch_blocks(row["domain"])
1549 elif row["software"] == "misskey":
1550 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1551 blocking = misskey.fetch_blocks(row["domain"])
1553 logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1555 # c.s isn't part of oliphant's "hidden" blocklists
1556 logger.debug("row[domain]='%s'", row["domain"])
1557 if row["domain"] != "chaos.social" and not blocklists.has(row["domain"]):
1558 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1559 instances.set_last_blocked(row["domain"])
1560 instances.set_total_blocks(row["domain"], blocking)
1565 logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1566 for block in blocking:
1567 logger.debug("block[blocked]='%s'", block["blocked"])
1570 if block["blocked"] == "":
1571 logger.debug("block[blocked] is empty - SKIPPED!")
1573 elif block["blocked"].endswith(".arpa"):
1574 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1576 elif block["blocked"].endswith(".tld"):
1577 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1579 elif block["blocked"].endswith(".onion"):
1580 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
# Wildcard/obfuscated entries: count them and try to resolve the real
# domain, optionally using the entry's hash when present.
1582 elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
1583 logger.debug("block='%s' is obfuscated.", block["blocked"])
1584 obfuscated = obfuscated + 1
1585 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1586 elif not domain_helper.is_wanted(block["blocked"]):
1587 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1589 elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1590 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1593 logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
# A successful deobfuscation yields a concrete domain different from
# the wildcard; undo the obfuscated-counter increment for it.
1594 if blocked is not None and blocked != block["blocked"]:
1595 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1596 obfuscated = obfuscated - 1
1598 if blocks.is_instance_blocked(row["domain"], blocked):
1599 logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1601 elif blacklist.is_blacklisted(blocked):
1602 logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
1605 block["block_level"] = blocks.alias_block_level(block["block_level"])
1607 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1608 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1609 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1612 "reason" : block["reason"],
1615 logger.debug("Settings obfuscated=%d for row[domain]='%s' ...", obfuscated, row["domain"])
1616 instances.set_obfuscated_blocks(row["domain"], obfuscated)
1618 logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
# Clear the flag once every obfuscated entry was resolved.
1619 if obfuscated == 0 and len(blocking) > 0:
1620 logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1621 instances.set_has_obfuscation(row["domain"], False)
1623 if instances.has_pending(row["domain"]):
1624 logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1625 instances.update(row["domain"])
1627 logger.debug("Invoking commit() ...")
1628 database.connection.commit()
1630 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1631 if config.get("bot_enabled") and len(blockdict) > 0:
# NOTE(review): unbalanced quote in "blocker='%s," in the message below.
1632 logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1633 network.send_bot_post(row["domain"], blockdict)
1635 logger.debug("Success! - EXIT!")
1638 def fetch_fedilist(args: argparse.Namespace) -> int:
# Fetch the instance CSV export from demo.fedilist.com (optionally filtered
# by --software) and queue every listed hostname for instance fetching.
# NOTE(review): elided view — `continue`/`return` lines and part of the
# request/CSV-guard structure are hidden by gaps in the embedded line
# numbers; confirm against the full source. Also note the endpoint uses
# plain http://, not https://.
1639 logger.debug("args[]='%s' - CALLED!", type(args))
1641 logger.debug("Invoking locking.acquire() ...")
1644 source_domain = "demo.fedilist.com"
1645 if sources.is_recent(source_domain):
1646 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1649 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1650 sources.update(source_domain)
# "onion=not" excludes TOR hidden services from the export.
1652 url = f"http://{source_domain}/instance/csv?onion=not"
1653 if args.software is not None and args.software != "":
1654 logger.debug("args.software='%s'", args.software)
1655 url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1657 logger.info("Fetching url='%s' ...", url)
1658 response = reqto.get(
1660 headers=network.web_headers,
1661 timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1662 allow_redirects=False
1665 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1666 if not response.ok or response.status_code > 200 or len(response.content) == 0:
1667 logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1670 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1672 logger.debug("reader[]='%s'", type(reader))
1674 logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1679 logger.info("Checking %d rows ...", len(rows))
1681 logger.debug("row[]='%s'", type(row))
1682 if "hostname" not in row:
1683 logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1686 logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1687 domain = tidyup.domain(row["hostname"])
1688 logger.debug("domain='%s' - AFTER!", domain)
1691 logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
1694 logger.debug("domain='%s' - BEFORE!", domain)
# Punycode-normalize before the wanted/registered/recent checks.
1695 domain = domain.encode("idna").decode("utf-8")
1696 logger.debug("domain='%s' - AFTER!", domain)
1698 if not domain_helper.is_wanted(domain):
1699 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
# --force bypasses the already-registered skip, but not the recency skip.
1701 elif (args.force is None or not args.force) and instances.is_registered(domain):
1702 logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1704 elif instances.is_recent(domain):
1705 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1708 logger.info("Fetching instances from domain='%s' ...", domain)
1709 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1711 logger.debug("Success! - EXIT!")
# Re-check the nodeinfo / software type of instances selected by CLI args.
# Selection priority (first match wins): --domain (exact), --software,
# --mode (detection_mode), --no-software, otherwise all instances whose
# last_nodeinfo timestamp is older than config "recheck_nodeinfo".
# Returns an int exit status (return statements not visible in this sparse
# listing — the original file's line numbers in the left column jump, so
# loop headers, try/except framing and returns are omitted here).
1714 def update_nodeinfo(args: argparse.Namespace) -> int:
1715 logger.debug("args[]='%s' - CALLED!", type(args))
# Serialize with other commands via the project-wide lock file.
1717 logger.debug("Invoking locking.acquire() ...")
# Build the candidate query; each branch selects (domain, software) rows.
1720 if args.domain is not None and args.domain != "":
1721 logger.debug("Fetching args.domain='%s'", args.domain)
1722 database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1723 elif args.software is not None and args.software != "":
1724 logger.info("Fetching domains for args.software='%s'", args.software)
# Software names are stored lower-case; staleness window from config.
1725 database.cursor.execute("SELECT domain, software FROM instances WHERE software = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.software.lower(), time.time() - config.get("recheck_nodeinfo")])
1726 elif args.mode is not None and args.mode != "":
# detection_mode values are stored upper-case, hence .upper() here.
1727 logger.info("Fetching domains for args.mode='%s'", args.mode.upper())
1728 database.cursor.execute("SELECT domain, software FROM instances WHERE detection_mode = ? AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [args.mode.upper(), time.time() - config.get("recheck_nodeinfo")])
1729 elif args.no_software:
1730 logger.info("Fetching domains with no software type detected ...")
1731 database.cursor.execute("SELECT domain, software FROM instances WHERE software IS NULL AND (last_nodeinfo < ? OR last_nodeinfo IS NULL)", [time.time() - config.get("recheck_nodeinfo")])
# Fallback branch (its "else:" line is not visible here): any stale row.
1733 logger.info("Fetching domains for recently updated ...")
1734 database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1736 domains = database.cursor.fetchall()
1738 logger.info("Checking %d domain(s) ...", len(domains))
# Presumably iterating "for row in domains:" — loop header not visible.
1741 logger.debug("row[]='%s'", type(row))
# Without --force, skip rows whose nodeinfo was checked recently.
1742 if not args.force and instances.is_recent(row["domain"], "last_nodeinfo"):
1743 logger.debug("row[domain]='%s' has been recently checked - SKIPPED!", row["domain"])
# "cnt" is a progress counter — its initialization/increment lines are not
# shown; the percentage is cnt over total selected domains.
1747 logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1748 software = federation.determine_software(row["domain"])
1750 logger.debug("Determined software='%s'", software)
# Persist only real changes, or everything when --force is given.
1751 if (software != row["software"] and software is not None) or args.force is True:
1752 logger.debug("software='%s'", software)
1753 if software is None:
# Detection failed: clear the stored nodeinfo URL for this domain.
1754 logger.debug("Setting nodeinfo_url to 'None' for row[domain]='%s' ...", row["domain"])
1755 instances.set_nodeinfo_url(row["domain"], None)
# (else-branch line not visible) software type changed — record it.
1757 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1758 instances.set_software(row["domain"], software)
1760 if software is not None:
1761 logger.debug("Setting row[domain]='%s' as successfully determined ...", row["domain"])
1762 instances.set_success(row["domain"])
# Matching "try:" line is above but not visible in this listing; network
# errors are recorded per-domain instead of aborting the whole run.
1763 except network.exceptions as exception:
1764 logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1765 instances.set_last_error(row["domain"], exception)
# Always stamp the check time and flush pending fields for this domain.
1767 instances.set_last_nodeinfo(row["domain"])
1768 instances.update(row["domain"])
1771 logger.debug("Success! - EXIT!")
# Pull the full instance list from the instances.social API (bearer-token
# authenticated) and queue every new, wanted domain for federation crawling.
# Requires config key "instances_social_api_key"; rate-limited through
# sources.is_recent()/sources.update(). Returns an int exit status (return
# lines are omitted from this sparse listing).
1774 def fetch_instances_social(args: argparse.Namespace) -> int:
1775 logger.debug("args[]='%s' - CALLED!", type(args))
1777 logger.debug("Invoking locking.acquire() ...")
1780 source_domain = "instances.social"
# Guard clauses: missing API key, or source polled too recently.
1782 if config.get("instances_social_api_key") == "":
1783 logger.error("API key not set. Please set in your config.json file.")
1785 elif sources.is_recent(source_domain):
1786 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
# Mark the source as used so the next run honors the cool-down window.
1789 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1790 sources.update(source_domain)
# Part of a headers dict literal — its surrounding braces are not visible
# in this listing.
1793 "Authorization": f"Bearer {config.get('instances_social_api_key')}",
1796 logger.info("Fetching list from source_domain='%s' ...", source_domain)
# count=0 requests an unlimited listing, sorted by instance name.
1797 fetched = network.get_json_api(
1799 "/api/1.0/instances/list?count=0&sort_by=name",
1801 (config.get("connection_timeout"), config.get("read_timeout"))
1803 logger.debug("fetched[]='%s'", type(fetched))
# Validate the API envelope before touching the payload.
1805 if "error_message" in fetched:
1806 logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1808 elif "exception" in fetched:
1809 logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1811 elif "json" not in fetched:
1812 logger.warning("fetched has no element 'json' - EXIT!")
1814 elif "instances" not in fetched["json"]:
1815 logger.warning("fetched[row] has no element 'instances' - EXIT!")
1819 rows = fetched["json"]["instances"]
1821 logger.info("Checking %d row(s) ...", len(rows))
# Presumably "for row in rows:" — loop header not visible here.
1823 logger.debug("row[]='%s'", type(row))
# Normalize the reported hostname (strip junk, lower-case, etc.).
1824 domain = tidyup.domain(row["name"])
1825 logger.debug("domain='%s' - AFTER!", domain)
1828 logger.debug("domain is empty - SKIPPED!")
# Convert Unicode hostnames to their punycode (IDNA) form for storage.
1831 logger.debug("domain='%s' - BEFORE!", domain)
1832 domain = domain.encode("idna").decode("utf-8")
1833 logger.debug("domain='%s' - AFTER!", domain)
# Skip unwanted, duplicate, already-known or recently crawled domains;
# "domains" is a dedup collection initialized outside the visible lines.
1835 if not domain_helper.is_wanted(domain):
1836 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1838 elif domain in domains:
1839 logger.debug("domain='%s' is already added - SKIPPED!", domain)
1841 elif instances.is_registered(domain):
1842 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1844 elif instances.is_recent(domain):
1845 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
# New domain: crawl it, tagging the crawl with this function's name.
1848 logger.info("Fetching instances from domain='%s'", domain)
1849 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1851 logger.debug("Success! - EXIT!")
# Scrape the public landing page of every known ActivityPub relay
# (software in: activityrelay, aoderelay, selective-relay) to collect the
# instances registered with it, store the peer list, then crawl any domain
# not yet registered. Returns an int exit status (return statements are
# omitted from this sparse listing — the left-column file numbers jump).
1854 def fetch_relays(args: argparse.Namespace) -> int:
1855 logger.debug("args[]='%s' - CALLED!", type(args))
1857 logger.debug("Invoking locking.acquire() ...")
# Either a single relay (--domain) or all known relays.
1860 if args.domain is not None and args.domain != "":
1861 database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
1863 database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
1866 rows = database.cursor.fetchall()
1868 logger.info("Checking %d relays ...", len(rows))
# Presumably "for row in rows:" — loop header not visible; "peers" and
# "domains" accumulators are initialized on lines not shown here.
1870 logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1872 if not args.force and instances.is_recent(row["domain"]):
1873 logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
# Fetch the relay's front page as raw HTML.
1877 logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
1878 raw = utils.fetch_url(
1879 f"https://{row['domain']}",
1880 network.web_headers,
1881 (config.get("connection_timeout"), config.get("read_timeout"))
1883 logger.debug("raw[%s]()=%d", type(raw), len(raw))
# Matching "try:" line is above but not visible; on network failure the
# error is recorded and this relay is flushed before moving on.
1884 except network.exceptions as exception:
1885 logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
1886 instances.set_last_error(row["domain"], exception)
1887 instances.set_last_instance_fetch(row["domain"])
1888 instances.update(row["domain"])
1891 doc = bs4.BeautifulSoup(raw, features="html.parser")
1892 logger.debug("doc[]='%s'", type(doc))
# Per-software HTML layout handling.
1894 logger.debug("row[software]='%s'", row["software"])
1895 if row["software"] == "activityrelay":
# activityrelay lists its peers inside a <p> that contains the phrase
# "registered instances" followed by domain text nodes.
1896 logger.debug("Checking row[domain]='%s' ...", row["domain"])
1897 tags = doc.findAll("p")
1899 logger.debug("Checking %d paragraphs ...", len(tags))
# Presumably "for tag in tags:" — loop header not visible.
1901 logger.debug("tag[]='%s'", type(tag))
1902 if len(tag.contents) == 0:
1903 logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
1905 elif "registered instances" not in tag.contents[0]:
1906 logger.debug("Skipping paragraph, text not found.")
1909 logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
1910 for domain in tag.contents:
1911 logger.debug("domain[%s]='%s'", type(domain), domain)
# Keep only bare text nodes; drop the heading text itself.
1912 if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
1915 domain = str(domain)
1916 logger.debug("domain='%s'", domain)
1917 if not domain_helper.is_wanted(domain):
1918 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1921 logger.debug("domain='%s' - BEFORE!", domain)
1922 domain = tidyup.domain(domain)
1923 logger.debug("domain='%s' - AFTER!", domain)
1926 logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1928 elif domain not in peers:
1929 logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1930 peers.append(domain)
# Dedup against the cross-relay "domains" list of dicts by key "domain".
1932 if dict_helper.has_key(domains, "domain", domain):
1933 logger.debug("domain='%s' already added", domain)
# The appended dict literal ({"domain": ..., "origin": ..., ...}) is only
# partially visible in this listing.
1936 logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
1939 "origin": row["domain"],
1941 elif row["software"] in ["aoderelay", "selective-relay"]:
1942 logger.debug("Checking row[domain]='%s' ...", row["domain"])
# aoderelay: one <section class="instance"> per peer;
# selective-relay: <li> items inside <div id="instances">.
1943 if row["software"] == "aoderelay":
1944 tags = doc.findAll("section", {"class": "instance"})
1946 tags = doc.find("div", {"id": "instances"}).findAll("li")
1948 logger.debug("Checking %d tags ...", len(tags))
# Presumably "for tag in tags:" — loop header not visible.
1950 logger.debug("tag[]='%s'", type(tag))
1952 link = tag.find("a")
1953 logger.debug("link[%s]='%s'", type(link), link)
1955 logger.warning("tag='%s' has no a-tag ...", tag)
# Extract the hostname from the anchor's href.
1958 components = urlparse(link["href"])
1959 domain = components.netloc.lower()
1961 if not domain_helper.is_wanted(domain):
1962 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1965 logger.debug("domain='%s' - BEFORE!", domain)
1966 domain = tidyup.domain(domain)
1967 logger.debug("domain='%s' - AFTER!", domain)
1970 logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
1972 elif domain not in peers:
1973 logger.debug("Appending domain='%s' to peers list for relay='%s' ...", domain, row["domain"])
1974 peers.append(domain)
1976 if dict_helper.has_key(domains, "domain", domain):
1977 logger.debug("domain='%s' already added", domain)
1980 logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
1983 "origin": row["domain"],
# (else-branch line not visible) unsupported relay software — warn only.
1986 logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
# Per-relay bookkeeping: stamp fetch time, store peer count, flush.
1988 logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
1989 instances.set_last_instance_fetch(row["domain"])
1991 logger.info("Relay '%s' has %d peer(s) registered.", row["domain"], len(peers))
1992 instances.set_total_peers(row["domain"], peers)
1994 logger.debug("Flushing data for row[domain]='%s'", row["domain"])
1995 instances.update(row["domain"])
# Second pass: crawl every collected domain that is not yet registered,
# passing its relay as origin.
1997 logger.info("Checking %d domains ...", len(domains))
1999 logger.debug("row[domain]='%s',row[origin]='%s'", row["domain"], row["origin"])
2000 if instances.is_registered(row["domain"]):
2001 logger.debug("row[domain]='%s' is already registered - SKIPPED!", row["domain"])
2004 logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
2005 federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
2007 logger.debug("Success! - EXIT!")
# One-shot maintenance task: convert all stored domain names that are not
# yet punycode (no 'xn--' marker) to their IDNA form, across both the
# instances table (domain, origin) and the blocks table (blocker, blocked).
# Returns an int exit status (return line omitted from this sparse listing).
2010 def convert_idna(args: argparse.Namespace) -> int:
2011 logger.debug("args[]='%s' - CALLED!", type(args))
# instances.domain column.
2013 database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
2014 rows = database.cursor.fetchall()
2016 logger.debug("rows[]='%s'", type(rows))
2017 instances.translate_idnas(rows, "domain")
# instances.origin column.
2019 database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
2020 rows = database.cursor.fetchall()
2022 logger.debug("rows[]='%s'", type(rows))
2023 instances.translate_idnas(rows, "origin")
# blocks.blocker column.
2025 database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
2026 rows = database.cursor.fetchall()
2028 logger.debug("rows[]='%s'", type(rows))
2029 blocks.translate_idnas(rows, "blocker")
# blocks.blocked column.
2031 database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
2032 rows = database.cursor.fetchall()
2034 logger.debug("rows[]='%s'", type(rows))
2035 blocks.translate_idnas(rows, "blocked")
2037 logger.debug("Success! - EXIT!")
# Database cleanup: delete every instance whose stored domain is not a
# syntactically valid domain name, cascade-delete its block records, then
# commit and VACUUM to reclaim space. Returns an int exit status (return
# line omitted from this sparse listing).
2040 def remove_invalid(args: argparse.Namespace) -> int:
2041 logger.debug("args[]='%s' - CALLED!", type(args))
2043 logger.debug("Invoking locking.acquire() ...")
2046 database.cursor.execute("SELECT domain FROM instances ORDER BY domain ASC")
2047 rows = database.cursor.fetchall()
2049 logger.info("Checking %d domains ...", len(rows))
# Presumably "for row in rows:" — loop header not visible here.
2051 logger.debug("row[domain]='%s'", row["domain"])
# split("/")[0] strips any path suffix before validating the hostname.
2052 if not validators.domain(row["domain"].split("/")[0]):
2053 logger.info("Invalid row[domain]='%s' found, removing ...", row["domain"])
# Remove block records first (both directions), then the instance row.
2054 database.cursor.execute("DELETE FROM blocks WHERE blocker = ? OR blocked = ?", [row["domain"], row["domain"]])
2055 database.cursor.execute("DELETE FROM instances WHERE domain = ? LIMIT 1", [row["domain"]])
2057 logger.debug("Invoking commit() ...")
2058 database.connection.commit()
2060 logger.info("Vaccum cleaning database ...")
2061 database.cursor.execute("VACUUM")
2063 logger.debug("Success! - EXIT!")