1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
23 from urllib.parse import urlparse
33 from fba import database
36 from fba.helpers import blacklist
37 from fba.helpers import config
38 from fba.helpers import cookies
39 from fba.helpers import locking
40 from fba.helpers import processing
41 from fba.helpers import software as software_helper
42 from fba.helpers import tidyup
44 from fba.http import federation
45 from fba.http import network
47 from fba.models import blocks
48 from fba.models import instances
49 from fba.models import sources
51 from fba.networks import friendica
52 from fba.networks import lemmy
53 from fba.networks import mastodon
54 from fba.networks import misskey
55 from fba.networks import pleroma
# Module-wide logging setup: INFO by default; uncomment the setLevel line
# below for verbose per-call tracing during development.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#logger.setLevel(logging.DEBUG)
def check_instance(args: argparse.Namespace) -> int:
    """Validate a single command-line domain (args.domain).

    Warns when the domain is syntactically invalid, blacklisted, or already
    registered; otherwise it is reported as not (yet) known.

    NOTE(review): this file appears to be a sampled/truncated copy - the
    'status = ...' assignments, the 'else:' branch and the 'return status'
    statement are not visible here; indentation below is reconstructed
    best-effort. TODO confirm against the full source.
    """
    logger.debug("args.domain='%s' - CALLED!", args.domain)

    if not validators.domain(args.domain):
        logger.warning("args.domain='%s' is not valid", args.domain)
    elif blacklist.is_blacklisted(args.domain):
        logger.warning("args.domain='%s' is blacklisted", args.domain)
    elif instances.is_registered(args.domain):
        logger.warning("args.domain='%s' is already registered", args.domain)

    # presumably inside a missing 'else:' branch in the full source - TODO confirm
    logger.info("args.domain='%s' is not known", args.domain)

    # NOTE(review): 'status' is never assigned in the visible lines
    logger.debug("status=%d - EXIT!", status)
def check_nodeinfo(args: argparse.Namespace) -> int:
    """Cross-check each instance's stored nodeinfo URL against its domain.

    Iterates all instances with a non-NULL nodeinfo_url and warns when the
    URL contains neither the raw domain nor its punycode form; relative
    URLs trivially match and are skipped.

    NOTE(review): sampled/truncated copy - the counter increment and the
    return statement are missing; 'cnt' is never assigned in the visible
    lines. Indentation reconstructed best-effort. TODO confirm.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")

    for row in database.cursor.fetchall():
        logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
        # IDNA-encode so internationalized domains compare correctly against the URL
        punycode = row["domain"].encode("idna").decode("utf-8")

        if row["nodeinfo_url"].startswith("/"):
            logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
        elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
            logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])

    # NOTE(review): 'cnt' is not assigned in the visible lines - TODO confirm
    logger.info("Found %d row(s)", cnt)
def fetch_pixelfed_api(args: argparse.Namespace) -> int:
    """Fetch the server list from the pixelfed.org API and crawl unknown domains.

    NOTE(review): sampled/truncated copy - the opening 'try:' blocks, the
    'for row in rows:' loop header, 'return'/'continue' statements and some
    closing parentheses are missing. Indentation below is reconstructed
    best-effort; confirm against the full source before relying on it.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    # No CSRF by default, you don't have to add network.source_headers by yourself here
    source_domain = "pixelfed.org"

    if sources.is_recent(source_domain):
        # Rate-limiting: this source was queried recently, bail out early
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    # NOTE(review): the opening 'try:' for this CSRF probe is not visible here
    logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
    headers = csrf.determine(source_domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)

    logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
    fetched = network.get_json_api(
        # NOTE(review): domain/headers arguments and closing ')' missing in this copy
        "/api/v1/servers/all.json?scope=All&country=all&language=all",
        (config.get("connection_timeout"), config.get("read_timeout"))

    logger.debug("JSON API returned %d elements", len(fetched))
    if "error_message" in fetched:
        logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
    elif "data" not in fetched["json"]:
        logger.warning("API did not return JSON with 'data' element - EXIT!")

    rows = fetched["json"]["data"]
    logger.info("Checking %d fetched rows ...", len(rows))
    # NOTE(review): 'for row in rows:' loop header missing in this copy
    logger.debug("row[]='%s'", type(row))
    if "domain" not in row:
        logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
    elif row["domain"] == "":
        logger.debug("row[domain] is empty - SKIPPED!")

    logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
    # IDNA-encode so internationalized domain names are stored uniformly
    domain = row["domain"].encode("idna").decode("utf-8")
    logger.debug("domain='%s' - AFTER!", domain)

    if not utils.is_domain_wanted(domain):
        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
    elif instances.is_registered(domain):
        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
    elif instances.is_recent(domain):
        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

    logger.debug("Fetching instances from domain='%s' ...", domain)
    federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    # NOTE(review): matching 'try:' for this handler is not visible here
    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))

    logger.debug("Success! - EXIT!")
def fetch_bkali(args: argparse.Namespace) -> int:
    """Fetch the domain list from the gql.api.bka.li GraphQL API and crawl new domains.

    NOTE(review): sampled/truncated copy - 'locking.acquire()', the
    'domains = list()' initialization, the surrounding 'try:' blocks, guard
    'if' headers and some closing parentheses are not visible; indentation
    below is reconstructed best-effort. TODO confirm against full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "gql.api.bka.li"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
    fetched = network.post_json_api(
        # NOTE(review): domain/path arguments, the dict literal opener and
        # closing ')' are missing in this copy
        "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"

    logger.debug("fetched[]='%s'", type(fetched))
    if "error_message" in fetched:
        logger.warning("post_json_api() for 'gql.sources.bka.li' returned error message='%s", fetched["error_message"])
    elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
        logger.warning("post_json_api() returned error: '%s", fetched["error"]["message"])

    rows = fetched["json"]

    logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
    # NOTE(review): guard header (likely 'if len(rows) == 0:') missing here
    raise Exception("WARNING: Returned no records")
    elif "data" not in rows:
        raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
    elif "nodeinfo" not in rows["data"]:
        raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")

    for entry in rows["data"]["nodeinfo"]:
        logger.debug("entry[%s]='%s'", type(entry), entry)
        if "domain" not in entry:
            logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
        elif entry["domain"] == "":
            logger.debug("entry[domain] is empty - SKIPPED!")
        elif not utils.is_domain_wanted(entry["domain"]):
            logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
        elif instances.is_registered(entry["domain"]):
            logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
        elif instances.is_recent(entry["domain"]):
            logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])

        # presumably in a missing 'else:' branch - TODO confirm
        logger.debug("Adding domain='%s' ...", entry["domain"])
        domains.append(entry["domain"])

    # NOTE(review): matching 'try:' for this handler is not visible here
    except network.exceptions as exception:
        logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s' - BEFORE!", domain)
        # IDNA-encode before crawling
        domain = domain.encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        # NOTE(review): 'try:' missing in this copy
        logger.info("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success - EXIT!")
def fetch_blocks(args: argparse.Namespace) -> int:
    """Fetch block lists from known blockers (or one domain/software) and record them.

    Dispatches to the software-specific fetcher (pleroma/mastodon/lemmy/
    friendica/misskey), tidies and deobfuscates blocked domains, stores new
    blocks via processing.block() and optionally notifies a bot.

    NOTE(review): sampled/truncated copy - 'else:' branches, several guard
    'if' headers, 'try:' blocks, the 'blockdict' initialization,
    'continue'/'return' statements and some closing parentheses are missing.
    Indentation below is reconstructed best-effort; confirm against the
    full source before relying on it.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))
    if args.domain is not None and args.domain != "":
        logger.debug("args.domain='%s' - checking ...", args.domain)
        if not validators.domain(args.domain):
            logger.warning("args.domain='%s' is not valid.", args.domain)
        elif blacklist.is_blacklisted(args.domain):
            logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
        elif not instances.is_registered(args.domain):
            logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)

    logger.debug("Invoking locking.acquire() ...")

    if args.domain is not None and args.domain != "":
        # Re-check single domain
        logger.debug("Querying database for single args.domain='%s' ...", args.domain)
        database.cursor.execute(
            # NOTE(review): closing ')' of this call is missing in this copy
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
    elif args.software is not None and args.software != "":
        # Re-check single software
        logger.debug("Querying database for args.software='%s' ...", args.software)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
    # NOTE(review): 'else:' branch header missing in this copy
        # Re-check after "timeout" (aka. minimum interval)
        database.cursor.execute(
            "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]

    rows = database.cursor.fetchall()
    logger.info("Checking %d entries ...", len(rows))
    for blocker, software, origin, nodeinfo_url in rows:
        logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
        blocker = tidyup.domain(blocker)
        logger.debug("blocker='%s' - AFTER!", blocker)

        # NOTE(review): guard header (likely 'if blocker == "":') missing here
        logger.warning("blocker is now empty!")
        elif nodeinfo_url is None or nodeinfo_url == "":
            logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
        elif not utils.is_domain_wanted(blocker):
            logger.debug("blocker='%s' is not wanted - SKIPPED!", blocker)

        logger.debug("blocker='%s'", blocker)
        instances.set_last_blocked(blocker)
        # Reset obfuscation flag; re-set below if deobfuscation fails
        instances.set_has_obfuscation(blocker, False)

        # Dispatch to the software-specific block-list fetcher
        if software == "pleroma":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
        elif software == "mastodon":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
        elif software == "lemmy":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
        elif software == "friendica":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = friendica.fetch_blocks(blocker)
        elif software == "misskey":
            logger.info("blocker='%s',software='%s'", blocker, software)
            blocking = misskey.fetch_blocks(blocker)
        # NOTE(review): 'else:' branch header missing in this copy
            logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)

        logger.debug("blocker='%s'", blocker)
        # chaos.social totals are maintained by fetch_cs() instead
        if blocker != "chaos.social":
            logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
            instances.set_total_blocks(blocker, blocking)

        logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)

        for block in blocking:
            logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])

            if block["block_level"] == "":
                logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])

            logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
            block["blocked"] = tidyup.domain(block["blocked"])
            block["reason"] = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
            logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])

            if block["blocked"] == "":
                logger.warning("blocked is empty, blocker='%s'", blocker)
            elif block["blocked"].endswith(".onion"):
                logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
            elif block["blocked"].endswith(".arpa"):
                logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
            elif block["blocked"].endswith(".tld"):
                logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
            elif block["blocked"].find("*") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some friendica servers also obscure domains without hash
                row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                # NOTE(review): guard header (likely 'if row is None:') missing here
                logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                instances.set_has_obfuscation(blocker, True)

                # presumably in a missing 'else:' branch - TODO confirm
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]
            elif block["blocked"].find("?") >= 0:
                logger.debug("blocker='%s' uses obfuscated domains", blocker)

                # Some obscure them with question marks, not sure if that's dependent on version or not
                row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)

                logger.debug("row[]='%s'", type(row))
                # NOTE(review): guard header (likely 'if row is None:') missing here
                logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
                instances.set_has_obfuscation(blocker, True)

                # presumably in a missing 'else:' branch - TODO confirm
                block["blocked"] = row["domain"]
                origin = row["origin"]
                nodeinfo_url = row["nodeinfo_url"]

            logger.debug("Looking up instance by domainm, blocked='%s'", block["blocked"])
            if block["blocked"] == "":
                logger.debug("block[blocked] is empty - SKIPPED!")

            logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
            # Strip any leading dot, then IDNA-encode for uniform storage
            block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
            logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])

            if not utils.is_domain_wanted(block["blocked"]):
                logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
            elif block["block_level"] in ["accept", "accepted"]:
                logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
            elif not instances.is_registered(block["blocked"]):
                logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
                federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)

            # Normalize software-specific block level names to canonical ones
            block["block_level"] = blocks.alias_block_level(block["block_level"])

            if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
                # NOTE(review): the 'blockdict.append({' opener and closing '})' are missing in this copy
                "blocked": block["blocked"],
                "reason" : block["reason"],

            logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
            cookies.clear(block["blocked"])

        logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
        if instances.has_pending(blocker):
            logger.debug("Flushing updates for blocker='%s' ...", blocker)
            instances.update_data(blocker)

        logger.debug("Invoking commit() ...")
        database.connection.commit()

        logger.debug("Invoking cookies.clear(%s) ...", blocker)
        cookies.clear(blocker)

        # NOTE(review): 'blockdict' initialization is not visible in this copy
        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d'", config.get("bot_enabled"), len(blockdict))
        if config.get("bot_enabled") and len(blockdict) > 0:
            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
            network.send_bot_post(blocker, blockdict)

    logger.debug("Success! - EXIT!")
def fetch_observer(args: argparse.Namespace) -> int:
    """Scrape fediverse.observer for instances grouped by software type and crawl them.

    NOTE(review): sampled/truncated copy - the 'types = list()'
    initialization, 'for item in items:' loop headers, guard 'if' headers,
    'try:' blocks and some closing parentheses are missing. Indentation
    below is reconstructed best-effort; confirm against the full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "fediverse.observer"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    if args.software is None:
        logger.info("Fetching software list ...")
        raw = utils.fetch_url(
            f"https://{source_domain}",
            # NOTE(review): headers argument and closing ')' missing in this copy
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("raw[%s]()=%d", type(raw), len(raw))
        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))

        navbar = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"})
        logger.debug("navbar[]='%s'", type(navbar))
        # NOTE(review): guard header (likely 'if navbar is None:') missing here
        logger.warning("Cannot find navigation bar, cannot continue!")

        items = navbar.findAll("a", {"class": "dropdown-item"})
        logger.debug("items[]='%s'", type(items))

        logger.info("Checking %d menu items ...", len(items))
        # NOTE(review): 'for item in items:' loop header missing in this copy
        logger.debug("item[%s]='%s'", type(item), item)
        if item.text.lower() == "all":
            logger.debug("Skipping 'All' menu entry ...")

        logger.debug("Appending item.text='%s' ...", item.text)
        types.append(tidyup.domain(item.text))
    # NOTE(review): 'else:' branch header missing in this copy
        logger.info("Adding args.software='%s' as type ...", args.software)
        types.append(args.software)

    logger.info("Fetching %d different table data ...", len(types))
    for software in types:
        logger.debug("software='%s' - BEFORE!", software)
        if args.software is not None and args.software != software:
            logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)

        # NOTE(review): 'try:' missing in this copy
        logger.debug("Fetching table data for software='%s' ...", software)
        raw = utils.fetch_url(
            f"https://{source_domain}/app/views/tabledata.php?software={software}",
            # NOTE(review): headers argument and closing ')' missing in this copy
            (config.get("connection_timeout"), config.get("read_timeout"))

        logger.debug("raw[%s]()=%d", type(raw), len(raw))
        doc = bs4.BeautifulSoup(raw, features="html.parser")
        logger.debug("doc[]='%s'", type(doc))
        except network.exceptions as exception:
            logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))

        items = doc.findAll("a", {"class": "url"})
        logger.info("Checking %d items,software='%s' ...", len(items), software)
        # NOTE(review): 'for item in items:' loop header missing in this copy
        logger.debug("item[]='%s'", type(item))
        domain = item.decode_contents()
        logger.debug("domain='%s' - AFTER!", domain)

        # NOTE(review): guard header (likely 'if domain == "":') missing here
        logger.debug("domain is empty - SKIPPED!")

        logger.debug("domain='%s' - BEFORE!", domain)
        # IDNA-encode for uniform storage
        domain = domain.encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
        elif instances.is_registered(domain):
            logger.debug("domain='%s' is already registered - SKIPPED!", domain)
        elif instances.is_recent(domain):
            logger.debug("domain='%s' is recently being handled - SKIPPED!", domain)

        # Normalize software aliases (e.g. forks) to their canonical names
        software = software_helper.alias(software)
        logger.info("Fetching instances for domain='%s'", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
def fetch_todon_wiki(args: argparse.Namespace) -> int:
    """Scrape the todon.eu wiki domain-block page and record silenced/rejected blocks.

    NOTE(review): sampled/truncated copy - the 'blocker', 'blocklist' and
    'blockdict' initializations, 'try:' blocks, guard headers and
    'continue' statements are missing. Indentation reconstructed
    best-effort; confirm against the full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "wiki.todon.eu"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    doc = bs4.BeautifulSoup(raw, "html.parser")
    logger.debug("doc[]='%s'", type(doc))

    # Two H3 sections on the wiki page: silenced/limited and suspended servers
    silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d silenced/limited entries ...", len(silenced))
    blocklist["silenced"] = utils.find_domains(silenced, "div")

    suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
    logger.info("Checking %d suspended entries ...", len(suspended))
    blocklist["reject"] = utils.find_domains(suspended, "div")

    blocking = blocklist["silenced"] + blocklist["reject"]

    # NOTE(review): 'blocker' is never assigned in the visible lines - TODO confirm
    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    for block_level in blocklist:
        blockers = blocklist[block_level]

        logger.debug("block_level='%s',blockers()=%d'", block_level, len(blockers))
        for blocked in blockers:
            logger.debug("blocked='%s'", blocked)

            if not instances.is_registered(blocked):
                # NOTE(review): 'try:' missing in this copy
                logger.info("Fetching instances from domain='%s' ...", blocked)
                federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
                except network.exceptions as exception:
                    logger.warning("Exception '%s' during fetching instances (fetch_cs) from blocked='%s'", type(exception), blocked)
                    instances.set_last_error(blocked, exception)

            if blocks.is_instance_blocked(blocker, blocked, block_level):
                logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)

            logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
            if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", blocked, block_level, blocker)
                # NOTE(review): the 'blockdict.append({...})' body is missing in this copy

    logger.debug("Invoking commit() ...")
    database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
def fetch_cs(args: argparse.Namespace):
    """Fetch chaos.social's federation.md from GitHub and record its block lists.

    NOTE(review): sampled/truncated copy - the 'extensions', 'blocklist'
    and 'blockdict' initializations, 'try:' blocks and the
    'blockdict.append({' opener are missing. Indentation reconstructed
    best-effort; confirm against the full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "raw.githubusercontent.com"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
    logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))

    # Render the markdown to HTML first, then parse the resulting tables.
    # NOTE(review): 'extensions' is not assigned in the visible lines - TODO confirm
    doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
    logger.debug("doc()=%d[]='%s'", len(doc), type(doc))

    silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
    logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
    blocklist["silenced"] = federation.find_domains(silenced)

    blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
    logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
    blocklist["reject"] = federation.find_domains(blocked)

    blocking = blocklist["silenced"] + blocklist["reject"]
    blocker = "chaos.social"

    logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
    instances.set_total_blocks(blocker, blocking)

    logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
    if len(blocking) > 0:
        for block_level in blocklist:
            logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))

            for row in blocklist[block_level]:
                logger.debug("row[%s]='%s'", type(row), row)
                if not "domain" in row:
                    logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
                elif not instances.is_registered(row["domain"]):
                    # NOTE(review): 'try:' missing in this copy
                    logger.info("Fetching instances from domain='%s' ...", row["domain"])
                    federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
                    except network.exceptions as exception:
                        logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
                        instances.set_last_error(row["domain"], exception)

                if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
                    logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], block_level, blocker)
                    # NOTE(review): 'blockdict.append({' opener and closing '})' are missing in this copy
                    "blocked": row["domain"],
                    "reason" : row["reason"],

        logger.debug("Invoking commit() ...")
        database.connection.commit()

    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
    if config.get("bot_enabled") and len(blockdict) > 0:
        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
        network.send_bot_post(blocker, blockdict)

    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
    if instances.has_pending(blocker):
        logger.debug("Flushing updates for blocker='%s' ...", blocker)
        instances.update_data(blocker)

    logger.debug("Success! - EXIT!")
def fetch_fba_rss(args: argparse.Namespace) -> int:
    """Fetch an FBA-specific RSS feed (args.feed) and crawl the listed domains.

    NOTE(review): sampled/truncated copy - the 'domains = list()'
    initialization, guard 'if' headers, 'try:' blocks and 'continue'
    statements are missing. Indentation reconstructed best-effort; confirm
    against the full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    # Rate-limit per feed host, not per full feed URL
    components = urlparse(args.feed)

    if sources.is_recent(components.netloc):
        logger.info("API from components.netloc='%s' has recently being accessed - EXIT!", components.netloc)
    logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
    sources.update(components.netloc)

    logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
        rss = atoma.parse_rss_bytes(response.content)

        logger.debug("rss[]='%s'", type(rss))
        for item in rss.items:
            logger.debug("item[%s]='%s'", type(item), item)
            # Item links carry the domain as the value after '=' in the query string
            domain = tidyup.domain(item.link.split("=")[1])

            logger.debug("domain='%s' - AFTER!", domain)
            # NOTE(review): guard header (likely 'if domain == "":') missing here
            logger.debug("domain is empty - SKIPPED!")

            logger.debug("domain='%s' - BEFORE!", domain)
            # IDNA-encode for uniform storage
            domain = domain.encode("idna").decode("utf-8")
            logger.debug("domain='%s' - AFTER!", domain)

            if not utils.is_domain_wanted(domain):
                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
            elif domain in domains:
                logger.debug("domain='%s' is already added - SKIPPED!", domain)
            elif instances.is_registered(domain):
                logger.debug("domain='%s' is already registered - SKIPPED!", domain)
            elif instances.is_recent(domain):
                logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

            # presumably in a missing 'else:' branch - TODO confirm
            logger.debug("Adding domain='%s'", domain)
            domains.append(domain)

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s'", domain)

        # NOTE(review): 'try:' missing in this copy
        logger.info("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
def fetch_fbabot_atom(args: argparse.Namespace) -> int:
    """Fetch the FBA bot's ATOM feed from ryona.agency and crawl linked domains.

    Each feed entry's HTML content is scanned for anchor tags; their
    comma-separated href values are tidied, IDNA-encoded, filtered and then
    crawled via federation.fetch_instances().

    NOTE(review): sampled/truncated copy - the 'domains = list()'
    initialization, guard 'if' headers, 'try:' blocks and 'continue'
    statements are missing. Indentation reconstructed best-effort; confirm
    against the full source.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")

    source_domain = "ryona.agency"
    if sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
    logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
    sources.update(source_domain)

    feed = f"https://{source_domain}/users/fba/feed.atom"

    logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and len(response.text) > 0:
        logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
        atom = atoma.parse_atom_bytes(response.content)

        logger.debug("atom[]='%s'", type(atom))
        for entry in atom.entries:
            logger.debug("entry[]='%s'", type(entry))
            doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
            logger.debug("doc[]='%s'", type(doc))
            for element in doc.findAll("a"):
                logger.debug("element[]='%s'", type(element))
                # hrefs may list several domains separated by commas
                for href in element["href"].split(","):
                    logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
                    domain = tidyup.domain(href)

                    logger.debug("domain='%s' - AFTER!", domain)
                    # NOTE(review): guard header (likely 'if domain == "":') missing here
                    logger.debug("domain is empty - SKIPPED!")

                    logger.debug("domain='%s' - BEFORE!", domain)
                    # IDNA-encode for uniform storage
                    domain = domain.encode("idna").decode("utf-8")
                    logger.debug("domain='%s' - AFTER!", domain)

                    if not utils.is_domain_wanted(domain):
                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                    elif domain in domains:
                        logger.debug("domain='%s' is already added - SKIPPED!", domain)
                    elif instances.is_registered(domain):
                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                    elif instances.is_recent(domain):
                        logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)

                    # presumably in a missing 'else:' branch - TODO confirm
                    logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
                    domains.append(domain)

    logger.debug("domains()=%d", len(domains))

    logger.info("Adding %d new instances ...", len(domains))
    for domain in domains:
        logger.debug("domain='%s'", domain)

        # NOTE(review): 'try:' missing in this copy
        logger.info("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
            instances.set_last_error(domain, exception)

    logger.debug("Success! - EXIT!")
886 def fetch_instances(args: argparse.Namespace) -> int:
# Command handler: crawl the peer list of args.domain, then re-crawl peers of
# already-registered instances (known software types only) whose last
# instance fetch is older than the "recheck_instance" interval.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): this numbered listing has gaps (e.g. 888, 892, 895-901,
# 908-913, 921, 925-926, 933-935, 941) — the elided statements (early
# "return" codes, "locking.acquire()", "try:" headers, the row loop header,
# "continue" guards) must be confirmed against the complete file before use.
887 logger.debug("args[]='%s' - CALLED!", type(args))
# Validate the CLI-supplied domain before doing any network or database work.
889 logger.debug("args.domain='%s' - checking ...", args.domain)
890 if not validators.domain(args.domain):
891 logger.warning("args.domain='%s' is not valid.", args.domain)
893 elif blacklist.is_blacklisted(args.domain):
894 logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
897 logger.debug("Invoking locking.acquire() ...")
# First pass: fetch instances for the explicitly requested domain.
902 logger.info("Fetching instances from args.domain='%s' ...", args.domain)
903 federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
# network.exceptions is used here as a tuple of network-layer exception types;
# a failure is recorded on the instance row rather than aborting the command.
904 except network.exceptions as exception:
905 logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
906 instances.set_last_error(args.domain, exception)
907 instances.update_data(args.domain)
911 logger.debug("Not fetching more instances - EXIT!")
914 # Loop through some instances
915 database.cursor.execute(
916 "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe', 'gotosocial', 'brighteon', 'wildebeest', 'bookwyrm', 'mitra', 'areionskey') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
919 rows = database.cursor.fetchall()
920 logger.info("Checking %d entries ...", len(rows))
922 logger.debug("row[domain]='%s'", row["domain"])
923 if row["domain"] == "":
924 logger.debug("row[domain] is empty - SKIPPED!")
# Normalize the stored domain to punycode (IDNA codec) for consistent lookups.
927 logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
928 domain = row["domain"].encode("idna").decode("utf-8")
929 logger.debug("domain='%s' - AFTER!", domain)
931 if not utils.is_domain_wanted(domain):
932 logger.debug("Domain domain='%s' is not wanted - SKIPPED!", domain)
936 logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
937 federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
938 except network.exceptions as exception:
939 logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
940 instances.set_last_error(domain, exception)
942 logger.debug("Success - EXIT!")
945 def fetch_oliphant(args: argparse.Namespace) -> int:
# Command handler: download the "oliphant" blocklist CSVs from codeberg.org
# and import each (blocker, blocked, severity) relation into the database.
# Honors args.domain as a filter on which blocker's CSV to process.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (947, 949-950, 954-955,
# 958-964, the dict braces around each blocklist entry, 1006-1010, loop
# headers, "continue" statements, 1105-1110, try/except framing, return
# codes) — confirm against the complete file before relying on structure.
946 logger.debug("args[]='%s' - CALLED!", type(args))
948 logger.debug("Invoking locking.acquire() ...")
# Rate-limit access to the upstream source; bail out if queried too recently.
951 source_domain = "codeberg.org"
952 if sources.is_recent(source_domain):
953 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
956 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
957 sources.update(source_domain)
960 base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
# Hard-coded catalogue of blocker domains and the CSV path each one publishes.
965 "blocker": "artisan.chat",
966 "csv_url": "mastodon/artisan.chat.csv",
968 "blocker": "mastodon.art",
969 "csv_url": "mastodon/mastodon.art.csv",
971 "blocker": "pleroma.envs.net",
972 "csv_url": "mastodon/pleroma.envs.net.csv",
974 "blocker": "oliphant.social",
975 "csv_url": "mastodon/_unified_tier3_blocklist.csv",
977 "blocker": "mastodon.online",
978 "csv_url": "mastodon/mastodon.online.csv",
980 "blocker": "mastodon.social",
981 "csv_url": "mastodon/mastodon.social.csv",
983 "blocker": "mastodon.social",
984 "csv_url": "other/missing-tier0-mastodon.social.csv",
986 "blocker": "rage.love",
987 "csv_url": "mastodon/rage.love.csv",
989 "blocker": "sunny.garden",
990 "csv_url": "mastodon/sunny.garden.csv",
992 "blocker": "sunny.garden",
993 "csv_url": "mastodon/gardenfence.csv",
995 "blocker": "solarpunk.moe",
996 "csv_url": "mastodon/solarpunk.moe.csv",
998 "blocker": "toot.wales",
999 "csv_url": "mastodon/toot.wales.csv",
1001 "blocker": "union.place",
1002 "csv_url": "mastodon/union.place.csv",
1004 "blocker": "oliphant.social",
1005 "csv_url": "mastodon/birdsite.csv",
1011 logger.debug("Downloading %d files ...", len(blocklists))
1012 for block in blocklists:
1013 # Is domain given and not equal blocker?
1014 if isinstance(args.domain, str) and args.domain != block["blocker"]:
1015 logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1017 elif args.domain in domains:
1018 logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
1022 logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
1023 response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1025 logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
1026 if not response.ok or response.status_code >= 300 or response.content == "":
1027 logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
1030 logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
1031 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
# CSV columns vary between lists: some prefix headers with '#' ("#domain",
# "#severity", ...), some do not — both spellings are accepted below.
1037 logger.debug("row[%s]='%s'", type(row), row)
1038 domain = severity = None
1039 reject_media = reject_reports = False
1041 if "#domain" in row:
1042 domain = row["#domain"]
1043 elif "domain" in row:
1044 domain = row["domain"]
1046 logger.debug("row='%s' does not contain domain column", row)
1049 if "#severity" in row:
1050 severity = blocks.alias_block_level(row["#severity"])
1051 elif "severity" in row:
1052 severity = blocks.alias_block_level(row["severity"])
1054 logger.debug("row='%s' does not contain severity column", row)
1057 if "#reject_media" in row and row["#reject_media"].lower() == "true":
1059 elif "reject_media" in row and row["reject_media"].lower() == "true":
1062 if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1063 reject_reports = True
1064 elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1065 reject_reports = True
1068 logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
# Filter out unusable domain values before any processing.
1070 logger.debug("domain is empty - SKIPPED!")
1072 elif domain.endswith(".onion"):
1073 logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
1075 elif domain.endswith(".arpa"):
1076 logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
1078 elif domain.endswith(".tld"):
1079 logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
1081 elif domain.find("*") >= 0 or domain.find("?") >= 0:
# Wildcard characters mean the blocker published an obfuscated entry; try to
# recover the real domain from our own data.
1082 logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
1083 domain = utils.deobfuscate(domain, block["blocker"])
1084 logger.debug("domain='%s' - AFTER!", domain)
1086 if not validators.domain(domain):
# NOTE(review): BUG — this format string has a '%s' placeholder but no
# argument is passed; logging will report a formatting error when DEBUG
# is enabled. Should be logger.debug(..., domain).
1087 logger.debug("domain='%s' is not a valid domain - SKIPPED!")
1089 elif blacklist.is_blacklisted(domain):
1090 logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
1092 elif blocks.is_instance_blocked(block["blocker"], domain, severity):
1093 logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
1096 logger.debug("Marking domain='%s' as handled", domain)
1097 domains.append(domain)
1099 logger.debug("Processing domain='%s' ...", domain)
1100 processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1101 logger.debug("processed='%s'", processed)
# Record the block relation; when the bot is enabled, remember it for the
# summary POST sent after the loop.
1103 if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
1104 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
1107 "reason" : block["reason"],
# reject_media / reject_reports are stored as separate block levels.
1111 processing.block(block["blocker"], domain, None, "reject_media")
1113 processing.block(block["blocker"], domain, None, "reject_reports")
# chaos.social is special-cased: its totals are not tracked here — the elided
# lines presumably explain why; verify against the complete file.
1115 logger.debug("block[blocker]='%s'", block["blocker"])
1116 if block["blocker"] != "chaos.social":
1117 logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
1118 instances.set_total_blocks(block["blocker"], domains)
1120 logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1121 if instances.has_pending(block["blocker"]):
1122 logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1123 instances.update_data(block["blocker"])
1125 logger.debug("Invoking commit() ...")
1126 database.connection.commit()
1128 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1129 if config.get("bot_enabled") and len(blockdict) > 0:
1130 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1131 network.send_bot_post(block["blocker"], blockdict)
1133 logger.debug("Success! - EXIT!")
1136 def fetch_txt(args: argparse.Namespace) -> int:
# Command handler: download plain-text blocklists (one domain per line) from a
# static list of URLs and run each surviving domain through generic
# processing. Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1138, 1140-1143,
# 1146-1147, 1149, 1152, 1157, 1162, 1164-1166, "continue" guards, return
# codes) — confirm structure against the complete file.
1137 logger.debug("args[]='%s' - CALLED!", type(args))
1139 logger.debug("Invoking locking.acquire() ...")
# Static source list: each entry pairs the publishing blocker with its URL.
1144 "blocker": "seirdy.one",
1145 "url" : "https://seirdy.one/pb/bsl.txt",
1148 logger.info("Checking %d text file(s) ...", len(urls))
1150 logger.debug("Fetching row[url]='%s' ...", row["url"])
1151 response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1153 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1154 if response.ok and response.status_code < 300 and response.text != "":
1155 logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
# One domain per line in the fetched text file.
1156 domains = response.text.split("\n")
1158 logger.info("Processing %d domains ...", len(domains))
1159 for domain in domains:
1160 logger.debug("domain='%s' - BEFORE!", domain)
1161 domain = tidyup.domain(domain)
1163 logger.debug("domain='%s' - AFTER!", domain)
1165 logger.debug("domain is empty - SKIPPED!")
1167 elif not utils.is_domain_wanted(domain):
1168 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1170 elif instances.is_recent(domain):
1171 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1174 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1175 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1177 logger.debug("processed='%s'", processed)
1179 logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1182 logger.debug("Success! - EXIT!")
1185 def fetch_fedipact(args: argparse.Namespace) -> int:
# Command handler: scrape the fedipact.online landing page, extract instance
# domains from its <li> elements and crawl each new, wanted one.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1187, 1189-1190,
# 1194-1195, 1198, 1204-1205, 1209, 1212, 1215, 1218, 1220-1223, "continue"
# guards, return codes) — confirm structure against the complete file.
1186 logger.debug("args[]='%s' - CALLED!", type(args))
1188 logger.debug("Invoking locking.acquire() ...")
# Rate-limit access to the upstream source; bail out if queried too recently.
1191 source_domain = "fedipact.online"
1192 if sources.is_recent(source_domain):
1193 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1196 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1197 sources.update(source_domain)
1199 logger.info("Fetching / from source_domain='%s' ...", source_domain)
1200 response = utils.fetch_url(
1201 f"https://{source_domain}",
1202 network.web_headers,
1203 (config.get("connection_timeout"), config.get("read_timeout"))
1206 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1207 if response.ok and response.status_code < 300 and response.text != "":
1208 logger.debug("Parsing %d Bytes ...", len(response.text))
1210 doc = bs4.BeautifulSoup(response.text, "html.parser")
1211 logger.debug("doc[]='%s'", type(doc))
# Each participating instance is listed as a <li> whose first child is the
# domain text.
1213 rows = doc.findAll("li")
1214 logger.info("Checking %d row(s) ...", len(rows))
1216 logger.debug("row[]='%s'", type(row))
1217 domain = tidyup.domain(row.contents[0])
1219 logger.debug("domain='%s' - AFTER!", domain)
1221 logger.debug("domain is empty - SKIPPED!")
# Normalize to punycode (IDNA codec) before the registration checks.
1224 logger.debug("domain='%s' - BEFORE!", domain)
1225 domain = domain.encode("idna").decode("utf-8")
1226 logger.debug("domain='%s' - AFTER!", domain)
1228 if not utils.is_domain_wanted(domain):
1229 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1231 elif instances.is_registered(domain):
1232 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1234 elif instances.is_recent(domain):
1235 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
# "beach.city" is passed as the origin attribution for newly found domains —
# presumably the pact's founding instance; verify against project docs.
1238 logger.info("Fetching domain='%s' ...", domain)
1239 federation.fetch_instances(domain, "beach.city", None, inspect.currentframe().f_code.co_name)
1241 logger.debug("Success! - EXIT!")
1244 def fetch_joinmisskey(args: argparse.Namespace) -> int:
# Command handler: fetch instances.json from instanceapp.misskey.page and
# crawl every listed Misskey instance that is wanted and not yet registered.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1246, 1248-1249,
# 1253-1254, 1257, 1263, 1265, 1268, 1271-1272, 1278, 1281, 1284-1285,
# "continue" guards, return codes) — confirm against the complete file.
1245 logger.debug("args[]='%s' - CALLED!", type(args))
1247 logger.debug("Invoking locking.acquire() ...")
# Rate-limit access to the upstream source; bail out if queried too recently.
1250 source_domain = "instanceapp.misskey.page"
1251 if sources.is_recent(source_domain):
1252 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1255 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1256 sources.update(source_domain)
1258 logger.info("Fetching instances.json from source_domain='%s' ...", source_domain)
1259 raw = utils.fetch_url(
1260 f"https://{source_domain}/instances.json",
1261 network.web_headers,
1262 (config.get("connection_timeout"), config.get("read_timeout"))
1264 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1266 parsed = json.loads(raw)
1267 logger.debug("parsed[%s]()=%d", type(parsed), len(parsed))
1269 if "instancesInfos" not in parsed:
# NOTE(review): BUG — this format string has a '%d' placeholder but no
# argument is passed; logging will report a formatting error. Should be
# logger.warning(..., len(parsed)).
1270 logger.warning("parsed()=%d does not contain element 'instancesInfos'")
1273 logger.info("Checking %d instane(s) ...", len(parsed["instancesInfos"]))
1274 for row in parsed["instancesInfos"]:
1275 logger.debug("row[%s]='%s'", type(row), row)
1276 if "url" not in row:
1277 logger.warning("row()=%d does not have element 'url' - SKIPPED!", len(row))
1279 elif not utils.is_domain_wanted(row["url"]):
1280 logger.debug("row[url]='%s' is not wanted - SKIPPED!", row["url"])
1282 elif instances.is_registered(row["url"]):
1283 logger.debug("row[url]='%s' is already registered - SKIPPED!", row["url"])
# "misskey.io" is recorded as the origin attribution for newly found domains.
1286 logger.info("Fetching row[url]='%s' ...", row["url"])
1287 federation.fetch_instances(row["url"], "misskey.io", None, inspect.currentframe().f_code.co_name)
1289 logger.debug("Success! - EXIT!")
1292 def fetch_joinfediverse(args: argparse.Namespace) -> int:
# Command handler: scrape the joinfediverse.wiki /FediBlock wiki tables into a
# blocklist, expand "subdomain(s)" rows into fully-qualified blocked domains,
# then register the blocks against every climatejustice.* blocker instance
# found in the local database. Returns an int exit code for the CLI
# dispatcher.
# NOTE(review): numbered listing with many elided lines (1294, 1296-1297,
# 1301-1302, 1305, 1311, 1313, 1316, 1318, 1320, 1323, 1327, 1329, 1334,
# 1336, 1339, 1343, 1346-1347, 1353, 1356, 1359-1361, 1363, 1367, 1370,
# 1373, 1377-1378, 1380, 1382-1383, 1385, 1388, 1390, 1393, 1395, 1405,
# 1407, 1413, 1416, 1419, 1422, 1425-1426, 1429-1430, 1434, 1438, 1442,
# 1445-1446, 1450, 1453-1454, 1458, 1461, 1466) — loop headers, counters,
# dict initialisations, "continue" guards and return codes are missing and
# must be confirmed against the complete file.
1293 logger.debug("args[]='%s' - CALLED!", type(args))
1295 logger.debug("Invoking locking.acquire() ...")
# Rate-limit access to the upstream source; bail out if queried too recently.
1298 source_domain = "joinfediverse.wiki"
1299 if sources.is_recent(source_domain):
1300 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1303 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1304 sources.update(source_domain)
1306 logger.info("Fetching /FediBlock wiki page from source_domain='%s' ...", source_domain)
1307 raw = utils.fetch_url(
1308 f"https://{source_domain}/FediBlock",
1309 network.web_headers,
1310 (config.get("connection_timeout"), config.get("read_timeout"))
1312 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1314 doc = bs4.BeautifulSoup(raw, "html.parser")
1315 logger.debug("doc[]='%s'", type(doc))
# The wiki marks its block tables with class "wikitable".
1317 tables = doc.findAll("table", {"class": "wikitable"})
1319 logger.info("Analyzing %d table(s) ...", len(tables))
1321 for table in tables:
1322 logger.debug("table[]='%s'", type(table))
1324 rows = table.findAll("tr")
1325 logger.info("Checking %d row(s) ...", len(rows))
# Maps column index -> recognized header name for the current table.
1326 block_headers = dict()
1328 logger.debug("row[%s]='%s'", type(row), row)
1330 headers = row.findAll("th")
1331 logger.debug("Found headers()=%d header(s)", len(headers))
# A row with more than one <th> is a header row: rebuild the column map.
1332 if len(headers) > 1:
1333 block_headers = dict()
1335 for header in headers:
1337 logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1338 text = header.contents[0]
1340 logger.debug("text[]='%s'", type(text))
1341 if not isinstance(text, str):
1342 logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1344 elif validators.domain(text.strip()):
1345 logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1348 text = tidyup.domain(text.strip())
1349 logger.debug("text='%s' - AFTER!", text)
# Only these four column names are scrapable further below.
1350 if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1351 logger.debug("Found header: '%s'=%d", text, cnt)
1352 block_headers[cnt] = text
1354 elif len(block_headers) == 0:
1355 logger.debug("row is not scrapable - SKIPPED!")
1357 elif len(block_headers) > 0:
# Data row: walk its cells and pick out the columns mapped above.
1358 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1362 for element in row.find_all(["th", "td"]):
1364 logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1366 if cnt in block_headers:
1365 logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1368 text = element.text.strip()
# "domain"/"instance" columns are canonicalized to the key "blocked".
1369 key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1371 logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1372 if key in ["domain", "instance"]:
1374 elif key == "reason":
1375 block[key] = tidyup.reason(text)
1376 elif key == "subdomain(s)":
# Subdomain cells list several names separated by "/".
1379 block[key] = text.split("/")
1381 logger.debug("key='%s'", key)
1384 logger.debug("block()=%d ...", len(block))
1386 logger.debug("Appending block()=%d ...", len(block))
1387 blocklist.append(block)
1389 logger.debug("blocklist()=%d", len(blocklist))
# The blockers applying this wiki list are the locally known climatejustice.*
# instances.
1391 database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1392 domains = database.cursor.fetchall()
1394 logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
# Expand "subdomain(s)" entries: one blocking record per subdomain.origin.
1396 for block in blocklist:
1397 logger.debug("block='%s'", block)
1398 if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1399 origin = block["blocked"]
1400 logger.debug("origin='%s'", origin)
1401 for subdomain in block["subdomain(s)"]:
1402 block["blocked"] = subdomain + "." + origin
1403 logger.debug("block[blocked]='%s'", block["blocked"])
1404 blocking.append(block)
1406 blocking.append(block)
# NOTE(review): BUG — '%d' applied to the list 'blocking' will raise a
# formatting error when DEBUG is enabled; should be len(blocking).
1408 logger.debug("blocking()=%d", blocking)
1409 for block in blocking:
1410 logger.debug("block[]='%s'", type(block))
1411 if "blocked" not in block:
1412 raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1414 block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1415 logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1417 if block["blocked"] == "":
1418 logger.debug("block[blocked] is empty - SKIPPED!")
1420 elif not utils.is_domain_wanted(block["blocked"]):
1421 logger.debug("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1423 elif instances.is_recent(block["blocked"]):
1424 logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
1427 logger.info("Proccessing blocked='%s' ...", block["blocked"])
1428 processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
# Second pass: register each block relation for every climatejustice blocker.
1431 for blocker in domains:
1432 blocker = blocker[0]
1433 logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1435 for block in blocking:
1436 logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1437 block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1439 logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1440 if block["blocked"] == "":
1441 logger.debug("block[blocked] is empty - SKIPPED!")
1443 elif not utils.is_domain_wanted(block["blocked"]):
1444 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1447 logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1448 if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
1449 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], blocker)
1451 "blocked": block["blocked"],
1452 "reason" : block["reason"],
1455 if instances.has_pending(blocker):
1456 logger.debug("Flushing updates for blocker='%s' ...", blocker)
1457 instances.update_data(blocker)
1459 logger.debug("Invoking commit() ...")
1460 database.connection.commit()
1462 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1463 if config.get("bot_enabled") and len(blockdict) > 0:
# NOTE(review): the log message below has an unbalanced quote
# ("blocker='%s,"), cosmetic only.
1464 logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", blocker, len(blockdict))
1465 network.send_bot_post(blocker, blockdict)
1467 logger.debug("Success! - EXIT!")
1470 def recheck_obfuscation(args: argparse.Namespace) -> int:
# Command handler: revisit instances flagged with has_obfuscation=1, re-fetch
# their block lists via the software-specific fetcher and try to deobfuscate
# wildcarded ("*", "?") blocked domains; clears the flag once a list is fully
# deobfuscated. Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1472, 1474-1475, 1480,
# 1482, 1485, 1489-1491, 1507, 1509, 1515-1518, 1522-1523, 1526, 1529, 1532,
# 1535, 1542, 1545-1546, 1553-1554, 1556, 1560-1564, 1569, 1573, 1576, 1581)
# — loop headers, counters, "continue" guards and return codes are missing
# and must be confirmed against the complete file.
1471 logger.debug("args[]='%s' - CALLED!", type(args))
1473 logger.debug("Invoking locking.acquire() ...")
# Select the working set: one domain, one software type, or everything
# currently flagged as obfuscated.
1476 if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1477 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
# NOTE(review): suspicious — validators.domain() does not return the input
# string, so "validators.domain(args.software) == args.software" looks like
# it can never be truthy for a plain software name; verify intent upstream.
1478 elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1479 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1481 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1483 rows = database.cursor.fetchall()
1484 logger.info("Checking %d domains ...", len(rows))
1486 logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
# Skip recently checked rows unless --force or an explicit filter was given.
1487 if (args.force is None or not args.force) and args.domain is None and args.software is None and instances.is_recent(row["domain"], "last_blocked"):
1488 logger.debug("row[domain]='%s' has been recently checked, args.force[]='%s' - SKIPPED!", row["domain"], type(args.force))
# Dispatch to the software-specific block-list fetcher.
1492 if row["software"] == "pleroma":
1493 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1494 blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1495 elif row["software"] == "mastodon":
1496 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1497 blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1498 elif row["software"] == "lemmy":
1499 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1500 blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1501 elif row["software"] == "friendica":
1502 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1503 blocking = friendica.fetch_blocks(row["domain"])
1504 elif row["software"] == "misskey":
1505 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1506 blocking = misskey.fetch_blocks(row["domain"])
1508 logger.warning("Unknown sofware: domain='%s',software='%s'", row["domain"], row["software"])
1510 logger.debug("row[domain]='%s'", row["domain"])
1511 # chaos.social requires special care ...
1512 if row["domain"] != "chaos.social":
1513 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1514 instances.set_total_blocks(row["domain"], blocking)
1519 logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1520 for block in blocking:
1521 logger.debug("block[blocked]='%s'", block["blocked"])
# Filter out unusable blocked values before attempting deobfuscation.
1524 if block["blocked"] == "":
1525 logger.debug("block[blocked] is empty - SKIPPED!")
1527 elif block["blocked"].endswith(".arpa"):
1528 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1530 elif block["blocked"].endswith(".tld"):
1531 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1533 elif block["blocked"].endswith(".onion"):
1534 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1536 elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
# Obfuscated entry: count it and try to resolve the real domain, optionally
# using the published hash when the software provides one.
1537 logger.debug("block='%s' is obfuscated.", block["blocked"])
1538 obfuscated = obfuscated + 1
1539 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1540 elif not utils.is_domain_wanted(block["blocked"]):
1541 logger.debug("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1543 elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1544 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
# A successful deobfuscation yields a different domain; undo the counter and
# store the recovered block unless it is already recorded.
1547 logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1548 if blocked is not None and blocked != block["blocked"]:
1549 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1550 obfuscated = obfuscated - 1
1551 if blocks.is_instance_blocked(row["domain"], blocked):
1552 logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1555 block["block_level"] = blocks.alias_block_level(block["block_level"])
1557 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1558 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
1559 logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["block_level"], row["domain"])
1562 "reason" : block["reason"],
# Fully deobfuscated lists get the has_obfuscation flag cleared.
1565 logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1566 if obfuscated == 0 and len(blocking) > 0:
1567 logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1568 instances.set_has_obfuscation(row["domain"], False)
1570 if instances.has_pending(row["domain"]):
1571 logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1572 instances.update_data(row["domain"])
1574 logger.debug("Invoking commit() ...")
1575 database.connection.commit()
1577 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1578 if config.get("bot_enabled") and len(blockdict) > 0:
# NOTE(review): unbalanced quote in the message below ("blocker='%s,"),
# cosmetic only.
1579 logger.info("Sending bot POST for blocker='%s,blockdict()=%d ...", row["domain"], len(blockdict))
1580 network.send_bot_post(row["domain"], blockdict)
1582 logger.debug("Success! - EXIT!")
1585 def fetch_fedilist(args: argparse.Namespace) -> int:
# Command handler: download the instance CSV from demo.fedilist.com
# (optionally filtered by args.software) and crawl each wanted, new domain.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1587, 1589-1590,
# 1594-1595, 1598, 1603, 1606, 1610-1611, 1615-1616, 1618, 1620, 1622-1625,
# 1627, 1631-1632, 1636-1640, 1644, 1647, 1650, 1653-1654, 1657) — loop
# headers, "continue" guards and return codes are missing and must be
# confirmed against the complete file.
1586 logger.debug("args[]='%s' - CALLED!", type(args))
1588 logger.debug("Invoking locking.acquire() ...")
# Rate-limit access to the upstream source; bail out if queried too recently.
1591 source_domain = "demo.fedilist.com"
1592 if sources.is_recent(source_domain):
1593 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1596 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1597 sources.update(source_domain)
# "onion=not" excludes TOR-only instances from the exported CSV.
1599 url = f"http://{source_domain}/instance/csv?onion=not"
1600 if args.software is not None and args.software != "":
1601 logger.debug("args.software='%s'", args.software)
1602 url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1604 logger.info("Fetching url='%s' ...", url)
1605 response = reqto.get(
1607 headers=network.web_headers,
1608 timeout=(config.get("connection_timeout"), config.get("read_timeout")),
1609 allow_redirects=False
1612 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1613 if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1614 logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.content()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
1617 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
1619 logger.debug("reader[]='%s'", type(reader))
1621 logger.warning("Failed parsing response.content()=%d as CSV content", len(response.content))
1626 logger.info("Checking %d rows ...", len(rows))
1628 logger.debug("row[]='%s'", type(row))
1629 if "hostname" not in row:
1630 logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
1633 logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
1634 domain = tidyup.domain(row["hostname"])
1635 logger.debug("domain='%s' - AFTER!", domain)
1638 logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
# Normalize to punycode (IDNA codec) before the wanted/registered checks.
1641 logger.debug("domain='%s' - BEFORE!", domain)
1642 domain = domain.encode("idna").decode("utf-8")
1643 logger.debug("domain='%s' - AFTER!", domain)
1645 if not utils.is_domain_wanted(domain):
1646 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1648 elif (args.force is None or not args.force) and instances.is_registered(domain):
1649 logger.debug("domain='%s' is already registered, --force not specified: args.force[]='%s'", domain, type(args.force))
1651 elif instances.is_recent(domain):
1652 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1655 logger.info("Fetching instances from domain='%s' ...", domain)
1656 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1658 logger.debug("Success! - EXIT!")
1661 def update_nodeinfo(args: argparse.Namespace) -> int:
# Command handler: re-detect the software type (via nodeinfo) for one domain,
# all domains of one software type, or every domain whose nodeinfo is older
# than the "recheck_nodeinfo" interval; stores the detected value.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1663, 1665-1666, 1673,
# 1676, 1678, 1680-1681, 1683, 1686, 1691, 1696, 1699-1700) — the row loop
# header, "try:" framing, a counter increment and the return code are
# missing and must be confirmed against the complete file.
1662 logger.debug("args[]='%s' - CALLED!", type(args))
1664 logger.debug("Invoking locking.acquire() ...")
# Select the working set depending on which CLI filter was supplied.
1667 if args.domain is not None and args.domain != "":
1668 logger.debug("Fetching args.domain='%s'", args.domain)
1669 database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
1670 elif args.software is not None and args.software != "":
1671 logger.info("Fetching domains for args.software='%s'", args.software)
1672 database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
1674 logger.info("Fetching domains for recently updated ...")
1675 database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])
1677 domains = database.cursor.fetchall()
1679 logger.info("Checking %d domain(s) ...", len(domains))
1682 logger.debug("row[]='%s'", type(row))
# cnt / len(domains) drives the progress percentage in the log line below.
1684 logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
1685 software = federation.determine_software(row["domain"])
1687 logger.debug("Determined software='%s'", software)
# Persist only when the detection changed (or --force was given).
1688 if (software != row["software"] and software is not None) or args.force is True:
1689 logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
1690 instances.set_software(row["domain"], software)
1692 instances.set_success(row["domain"])
1693 except network.exceptions as exception:
1694 logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
1695 instances.set_last_error(row["domain"], exception)
# Always stamp the nodeinfo check time and flush pending instance data.
1697 instances.set_last_nodeinfo(row["domain"])
1698 instances.update_data(row["domain"])
1701 logger.debug("Success! - EXIT!")
1704 def fetch_instances_social(args: argparse.Namespace) -> int:
# Command handler: query the instances.social REST API (Bearer-token
# authenticated, key from config "instances_social_api_key") for the full
# instance list and crawl each wanted, new domain.
# Returns an int exit code for the CLI dispatcher.
# NOTE(review): numbered listing with elided lines (1706, 1708-1709, 1711,
# 1714, 1717-1718, 1721-1722, 1724-1725, 1727, 1729, 1731, 1733, 1736, 1739,
# 1742, 1745-1747, 1749, 1751, 1755-1756, 1758-1759, 1763, 1766, 1769, 1772,
# 1775-1776, 1779) — the headers dict, row loop header, "continue" guards
# and return codes are missing and must be confirmed against the complete
# file.
1705 logger.debug("args[]='%s' - CALLED!", type(args))
1707 logger.debug("Invoking locking.acquire() ...")
1710 source_domain = "instances.social"
# The API is unusable without a configured key; also rate-limit the source.
1712 if config.get("instances_social_api_key") == "":
1713 logger.error("API key not set. Please set in your config.json file.")
1715 elif sources.is_recent(source_domain):
1716 logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
1719 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1720 sources.update(source_domain)
1723 "Authorization": f"Bearer {config.get('instances_social_api_key')}",
# count=0 requests the unpaginated full list, sorted by name.
1726 fetched = network.get_json_api(
1728 "/api/1.0/instances/list?count=0&sort_by=name",
1730 (config.get("connection_timeout"), config.get("read_timeout"))
1732 logger.debug("fetched[]='%s'", type(fetched))
# Defensive checks on the API envelope before touching the payload.
1734 if "error_message" in fetched:
1735 logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
1737 elif "exception" in fetched:
1738 logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
1740 elif "json" not in fetched:
1741 logger.warning("fetched has no element 'json' - EXIT!")
1743 elif "instances" not in fetched["json"]:
1744 logger.warning("fetched[row] has no element 'instances' - EXIT!")
1748 rows = fetched["json"]["instances"]
1750 logger.info("Checking %d row(s) ...", len(rows))
1752 logger.debug("row[]='%s'", type(row))
1753 domain = tidyup.domain(row["name"])
1754 logger.debug("domain='%s' - AFTER!", domain)
1757 logger.debug("domain is empty - SKIPPED!")
# Normalize to punycode (IDNA codec) before the wanted/registered checks.
1760 logger.debug("domain='%s' - BEFORE!", domain)
1761 domain = domain.encode("idna").decode("utf-8")
1762 logger.debug("domain='%s' - AFTER!", domain)
1764 if not utils.is_domain_wanted(domain):
1765 logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
1767 elif domain in domains:
1768 logger.debug("domain='%s' is already added - SKIPPED!", domain)
1770 elif instances.is_registered(domain):
1771 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1773 elif instances.is_recent(domain):
1774 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1777 logger.info("Fetching instances from domain='%s'", domain)
1778 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1780 logger.debug("Success! - EXIT!")
def convert_idna(args: argparse.Namespace) -> int:
    """Convert all non-punycode domain columns in the database to IDNA form.

    Walks instances.domain, instances.origin, blocks.blocker and blocks.blocked,
    selecting rows whose value does not yet contain the punycode marker 'xn--',
    and hands each result set to the matching model-layer translator. Always
    returns 0.
    """
    logger.debug("args[]='%s' - CALLED!", type(args))

    # (query, column, translator) triples - one pass per domain-bearing column.
    # The four stanzas were copy-pasted in the original; a data-driven loop
    # keeps query, column name and translator visibly paired.
    jobs = (
        ("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC", "domain", instances.translate_idnas),
        ("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC", "origin", instances.translate_idnas),
        ("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC", "blocker", blocks.translate_idnas),
        ("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC", "blocked", blocks.translate_idnas),
    )

    for query, column, translator in jobs:
        database.cursor.execute(query)
        rows = database.cursor.fetchall()

        logger.debug("rows[]='%s'", type(rows))
        translator(rows, column)

    logger.debug("Success! - EXIT!")
    return 0