1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
from urllib.parse import urlparse

import argparse
import csv
import inspect
import logging
import time

import atoma
import bs4
import markdown
import reqto
import validators

from fba import csrf
from fba import database
from fba import utils

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import locking
from fba.helpers import processing
from fba.helpers import software as software_helper
from fba.helpers import tidyup

from fba.http import federation
from fba.http import network

from fba.models import blocks
from fba.models import instances
from fba.models import sources

from fba.networks import friendica
from fba.networks import lemmy
from fba.networks import mastodon
from fba.networks import misskey
from fba.networks import pleroma
57 logging.basicConfig(level=logging.INFO)
58 logger = logging.getLogger(__name__)
59 #logger.setLevel(logging.DEBUG)
61 def check_instance(args: argparse.Namespace) -> int:
62 logger.debug("args.domain='%s' - CALLED!", args.domain)
64 if not validators.domain(args.domain):
65 logger.warning("args.domain='%s' is not valid", args.domain)
67 elif blacklist.is_blacklisted(args.domain):
68 logger.warning("args.domain='%s' is blacklisted", args.domain)
70 elif instances.is_registered(args.domain):
71 logger.warning("args.domain='%s' is already registered", args.domain)
74 logger.info("args.domain='%s' is not known", args.domain)
76 logger.debug("status=%d - EXIT!", status)
79 def check_nodeinfo(args: argparse.Namespace) -> int:
80 logger.debug("args[]='%s' - CALLED!", type(args))
83 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE nodeinfo_url IS NOT NULL ORDER BY domain ASC")
86 for row in database.cursor.fetchall():
87 logger.debug("Checking row[domain]='%s',row[software]='%s',row[nodeinfo_url]='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
88 punycode = row["domain"].encode("idna").decode("utf-8")
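# Illustrative example of the IDNA round-trip used above (assumption, not part of
# the original code): a Unicode hostname is converted to its punycode form so it
# can be matched against nodeinfo URLs in either spelling:
#   >>> "münchen.example".encode("idna").decode("utf-8")
#   'xn--mnchen-3ya.example'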
90 if row["nodeinfo_url"].startswith("/"):
91 logger.debug("row[nodeinfo_url]='%s' is a relative URL and always matches", row["nodeinfo_url"])
93 elif row["nodeinfo_url"].find(punycode) == -1 and row["nodeinfo_url"].find(row["domain"]) == -1:
94 logger.warning("punycode='%s' is not found in row[nodeinfo_url]='%s',row[software]='%s'", punycode, row["nodeinfo_url"], row["software"])
97 logger.info("Found %d row(s)", cnt)
102 def fetch_pixelfed_api(args: argparse.Namespace) -> int:
103 logger.debug("args[]='%s' - CALLED!", type(args))
# No CSRF token is sent by default; there is no need to add network.source_headers manually here
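# Note (assumption based on the call further below): csrf.determine() starts from the
# empty dict passed to it and returns whatever extra headers (e.g. a CSRF token) the
# remote instance requires; that dict is then handed to network.get_json_api() as-is.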
107 source_domain = "pixelfed.org"
109 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
113 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
114 sources.update(source_domain)
117 logger.debug("Checking CSRF from source_domain='%s' ...", source_domain)
118 headers = csrf.determine(source_domain, dict())
119 except network.exceptions as exception:
logger.warning("Exception '%s' during checking CSRF (fetch_pixelfed_api,%s) - EXIT!", type(exception), __name__)
124 logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
125 fetched = network.get_json_api(
127 "/api/v1/servers/all.json?scope=All&country=all&language=all",
129 (config.get("connection_timeout"), config.get("read_timeout"))
132 logger.debug("JSON API returned %d elements", len(fetched))
133 if "error_message" in fetched:
134 logger.warning("API returned error_message='%s' - EXIT!", fetched["error_message"])
136 elif "data" not in fetched["json"]:
137 logger.warning("API did not return JSON with 'data' element - EXIT!")
140 rows = fetched["json"]["data"]
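# Illustrative shape of the pixelfed.org API response handled below (assumption
# inferred from the checks above and the loop below, not a captured sample):
#   {"data": [{"domain": "pixelfed.social", ...}, {"domain": "xn--...", ...}]}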
141 logger.info("Checking %d fetched rows ...", len(rows))
143 logger.debug("row[]='%s'", type(row))
144 if "domain" not in row:
145 logger.warning("row='%s' does not contain element 'domain' - SKIPPED!", row)
147 elif row["domain"] == "":
148 logger.debug("row[domain] is empty - SKIPPED!")
151 logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
152 domain = row["domain"].encode("idna").decode("utf-8")
153 logger.debug("domain='%s' - AFTER!", domain)
155 if not utils.is_domain_wanted(domain):
156 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
158 elif instances.is_registered(domain):
159 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
161 elif instances.is_recent(domain):
162 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
165 logger.debug("Fetching instances from domain='%s' ...", domain)
166 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
168 except network.exceptions as exception:
logger.warning("Cannot fetch JSON API,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
172 logger.debug("Success! - EXIT!")
175 def fetch_bkali(args: argparse.Namespace) -> int:
176 logger.debug("args[]='%s' - CALLED!", type(args))
178 logger.debug("Invoking locking.acquire() ...")
181 source_domain = "gql.api.bka.li"
182 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
186 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
187 sources.update(source_domain)
191 logger.info("Fetching domainlist from source_domain='%s' ...", source_domain)
192 fetched = network.post_json_api(
196 "query": "query domainlist {nodeinfo(order_by: {domain: asc}) {domain}}"
200 logger.debug("fetched[]='%s'", type(fetched))
201 if "error_message" in fetched:
logger.warning("post_json_api() for source_domain='%s' returned error message='%s'", source_domain, fetched["error_message"])
204 elif isinstance(fetched["json"], dict) and "error" in fetched["json"] and "message" in fetched["json"]["error"]:
logger.warning("post_json_api() returned error='%s'", fetched["json"]["error"]["message"])
208 rows = fetched["json"]
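# Expected shape of the GraphQL result checked below (assumption inferred from the
# query above and the key checks that follow, not a captured response):
#   {"data": {"nodeinfo": [{"domain": "example.social"}, ...]}}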
210 logger.debug("rows(%d)[]='%s'", len(rows), type(rows))
212 raise Exception("WARNING: Returned no records")
213 elif "data" not in rows:
214 raise Exception(f"WARNING: rows()={len(rows)} does not contain key 'data'")
215 elif "nodeinfo" not in rows["data"]:
216 raise Exception(f"WARNING: rows()={len(rows['data'])} does not contain key 'nodeinfo'")
218 for entry in rows["data"]["nodeinfo"]:
219 logger.debug("entry[%s]='%s'", type(entry), entry)
220 if "domain" not in entry:
221 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
223 elif entry["domain"] == "":
224 logger.debug("entry[domain] is empty - SKIPPED!")
226 elif not utils.is_domain_wanted(entry["domain"]):
227 logger.warning("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
229 elif instances.is_registered(entry["domain"]):
230 logger.debug("entry[domain]='%s' is already registered - SKIPPED!", entry["domain"])
232 elif instances.is_recent(entry["domain"]):
233 logger.debug("entry[domain]='%s' has been recently crawled - SKIPPED!", entry["domain"])
236 logger.debug("Adding domain='%s' ...", entry["domain"])
237 domains.append(entry["domain"])
239 except network.exceptions as exception:
240 logger.warning("Cannot fetch graphql,exception[%s]:'%s' - EXIT!", type(exception), str(exception))
243 logger.debug("domains()=%d", len(domains))
245 logger.info("Adding %d new instances ...", len(domains))
246 for domain in domains:
247 logger.debug("domain='%s' - BEFORE!", domain)
248 domain = domain.encode("idna").decode("utf-8")
249 logger.debug("domain='%s' - AFTER!", domain)
252 logger.info("Fetching instances from domain='%s' ...", domain)
253 federation.fetch_instances(domain, 'tak.teleyal.blog', None, inspect.currentframe().f_code.co_name)
254 except network.exceptions as exception:
255 logger.warning("Exception '%s' during fetching instances (fetch_bkali) from domain='%s'", type(exception), domain)
256 instances.set_last_error(domain, exception)
259 logger.debug("Success - EXIT!")
262 def fetch_blocks(args: argparse.Namespace) -> int:
263 logger.debug("args[]='%s' - CALLED!", type(args))
264 if args.domain is not None and args.domain != "":
265 logger.debug("args.domain='%s' - checking ...", args.domain)
266 if not validators.domain(args.domain):
267 logger.warning("args.domain='%s' is not valid.", args.domain)
269 elif blacklist.is_blacklisted(args.domain):
270 logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
272 elif not instances.is_registered(args.domain):
273 logger.warning("args.domain='%s' is not registered, please run ./utils.py fetch_instances '%s' first.", args.domain, args.domain)
276 logger.debug("Invoking locking.acquire() ...")
279 if args.domain is not None and args.domain != "":
280 # Re-check single domain
281 logger.debug("Querying database for single args.domain='%s' ...", args.domain)
282 database.cursor.execute(
283 "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
285 elif args.software is not None and args.software != "":
286 # Re-check single software
287 logger.debug("Querying database for args.software='%s' ...", args.software)
288 database.cursor.execute(
289 "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ? AND nodeinfo_url IS NOT NULL", [args.software]
# Re-check all relevant instances after the "recheck_block" interval (i.e. the minimum interval between checks)
293 database.cursor.execute(
294 "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey') AND (last_blocked IS NULL OR last_blocked < ?) AND nodeinfo_url IS NOT NULL ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
297 rows = database.cursor.fetchall()
298 logger.info("Checking %d entries ...", len(rows))
299 for blocker, software, origin, nodeinfo_url in rows:
300 logger.debug("blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
301 blocker = tidyup.domain(blocker)
302 logger.debug("blocker='%s' - AFTER!", blocker)
305 logger.warning("blocker is now empty!")
307 elif nodeinfo_url is None or nodeinfo_url == "":
308 logger.debug("blocker='%s',software='%s' has empty nodeinfo_url", blocker, software)
310 elif not utils.is_domain_wanted(blocker):
311 logger.warning("blocker='%s' is not wanted - SKIPPED!", blocker)
314 logger.debug("blocker='%s'", blocker)
315 instances.set_last_blocked(blocker)
316 instances.set_has_obfuscation(blocker, False)
319 if software == "pleroma":
320 logger.info("blocker='%s',software='%s'", blocker, software)
321 blocking = pleroma.fetch_blocks(blocker, nodeinfo_url)
322 elif software == "mastodon":
323 logger.info("blocker='%s',software='%s'", blocker, software)
324 blocking = mastodon.fetch_blocks(blocker, nodeinfo_url)
325 elif software == "lemmy":
326 logger.info("blocker='%s',software='%s'", blocker, software)
327 blocking = lemmy.fetch_blocks(blocker, nodeinfo_url)
328 elif software == "friendica":
329 logger.info("blocker='%s',software='%s'", blocker, software)
330 blocking = friendica.fetch_blocks(blocker)
331 elif software == "misskey":
332 logger.info("blocker='%s',software='%s'", blocker, software)
333 blocking = misskey.fetch_blocks(blocker)
335 logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
337 logger.debug("blocker='%s'", blocker)
338 if blocker != "chaos.social":
339 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
340 instances.set_total_blocks(blocker, blocking)
342 logger.info("Checking %d entries from blocker='%s',software='%s' ...", len(blocking), blocker, software)
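# Each entry in blocking is expected to be a dict roughly shaped like this
# (assumption inferred from the keys accessed below; "hash" is optional):
#   {"blocker": "...", "blocked": "example.com", "block_level": "reject", "reason": "...", "hash": "..."}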
344 for block in blocking:
345 logger.debug("blocked='%s',block_level='%s',reason='%s'", block["blocked"], block["block_level"], block["reason"])
347 if block["block_level"] == "":
348 logger.warning("block_level is empty, blocker='%s',blocked='%s'", block["blocker"], block["blocked"])
351 logger.debug("blocked='%s',reason='%s' - BEFORE!", block["blocked"], block["reason"])
352 block["blocked"] = tidyup.domain(block["blocked"])
353 block["reason"] = tidyup.reason(block["reason"]) if block["reason"] is not None and block["reason"] != "" else None
354 logger.debug("blocked='%s',reason='%s' - AFTER!", block["blocked"], block["reason"])
356 if block["blocked"] == "":
357 logger.warning("blocked is empty, blocker='%s'", blocker)
359 elif block["blocked"].endswith(".onion"):
360 logger.debug("blocked='%s' is a TOR .onion domain - SKIPPED", block["blocked"])
362 elif block["blocked"].endswith(".arpa"):
363 logger.debug("blocked='%s' is a reverse IP address - SKIPPED", block["blocked"])
365 elif block["blocked"].endswith(".tld"):
366 logger.debug("blocked='%s' is a fake domain - SKIPPED", block["blocked"])
368 elif block["blocked"].find("*") >= 0:
369 logger.debug("blocker='%s' uses obfuscated domains", blocker)
# Some Friendica servers also obfuscate domains without providing a hash
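# Illustrative example (assumption): such an entry may look like "exa*ple.com",
# optionally accompanied by a hash of the real domain; instances.deobfuscate()
# then tries to match the pattern (and hash, if present) against known domains.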
372 row = instances.deobfuscate("*", block["blocked"], block["hash"] if "hash" in block else None)
374 logger.debug("row[]='%s'", type(row))
376 logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
377 instances.set_has_obfuscation(blocker, True)
380 block["blocked"] = row["domain"]
381 origin = row["origin"]
382 nodeinfo_url = row["nodeinfo_url"]
383 elif block["blocked"].find("?") >= 0:
384 logger.debug("blocker='%s' uses obfuscated domains", blocker)
# Some servers obfuscate domains with question marks; whether this depends on the software version is unclear
387 row = instances.deobfuscate("?", block["blocked"], block["hash"] if "hash" in block else None)
389 logger.debug("row[]='%s'", type(row))
391 logger.warning("Cannot deobfuscate blocked='%s',blocker='%s',software='%s' - SKIPPED!", block["blocked"], blocker, software)
392 instances.set_has_obfuscation(blocker, True)
395 block["blocked"] = row["domain"]
396 origin = row["origin"]
397 nodeinfo_url = row["nodeinfo_url"]
logger.debug("Looking up instance by domain, blocked='%s'", block["blocked"])
400 if block["blocked"] == "":
401 logger.debug("block[blocked] is empty - SKIPPED!")
404 logger.debug("block[blocked]='%s' - BEFORE!", block["blocked"])
405 block["blocked"] = block["blocked"].lstrip(".").encode("idna").decode("utf-8")
406 logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
408 if not utils.is_domain_wanted(block["blocked"]):
409 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
411 elif block["block_level"] in ["accept", "accepted"]:
412 logger.debug("blocked='%s' is accepted, not wanted here - SKIPPED!", block["blocked"])
414 elif not instances.is_registered(block["blocked"]):
415 logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", block["blocked"], blocker)
416 federation.fetch_instances(block["blocked"], blocker, None, inspect.currentframe().f_code.co_name)
418 block["block_level"] = blocks.alias_block_level(block["block_level"])
420 if processing.block(blocker, block["blocked"], block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
423 "blocked": block["blocked"],
424 "reason" : block["reason"],
427 logger.debug("Invoking cookies.clear(%s) ...", block["blocked"])
428 cookies.clear(block["blocked"])
430 logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
431 if instances.has_pending(blocker):
432 logger.debug("Flushing updates for blocker='%s' ...", blocker)
433 instances.update_data(blocker)
435 logger.debug("Invoking commit() ...")
436 database.connection.commit()
438 logger.debug("Invoking cookies.clear(%s) ...", blocker)
439 cookies.clear(blocker)
logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
442 if config.get("bot_enabled") and len(blockdict) > 0:
443 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
444 network.send_bot_post(blocker, blockdict)
446 logger.debug("Success! - EXIT!")
449 def fetch_observer(args: argparse.Namespace) -> int:
450 logger.debug("args[]='%s' - CALLED!", type(args))
452 logger.debug("Invoking locking.acquire() ...")
455 source_domain = "fediverse.observer"
456 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
460 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
461 sources.update(source_domain)
464 if args.software is None:
465 logger.info("Fetching software list ...")
466 raw = utils.fetch_url(
467 f"https://{source_domain}",
469 (config.get("connection_timeout"), config.get("read_timeout"))
471 logger.debug("raw[%s]()=%d", type(raw), len(raw))
473 doc = bs4.BeautifulSoup(raw, features="html.parser")
474 logger.debug("doc[]='%s'", type(doc))
476 items = doc.find("div", {"aria-labelledby": "navbarDropdownMenuSoftwares"}).findAll("a", {"class": "dropdown-item"})
477 logger.debug("items[]='%s'", type(items))
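# The dropdown being scraped is expected to look roughly like this (illustrative
# assumption, not a verbatim copy of the fediverse.observer markup):
#   <div aria-labelledby="navbarDropdownMenuSoftwares">
#     <a class="dropdown-item">All</a>
#     <a class="dropdown-item">Mastodon</a>
#     ...
#   </div>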
479 logger.info("Checking %d menu items ...", len(items))
481 logger.debug("item[%s]='%s'", type(item), item)
482 if item.text.lower() == "all":
483 logger.debug("Skipping 'All' menu entry ...")
486 logger.debug("Appending item.text='%s' ...", item.text)
487 types.append(tidyup.domain(item.text))
489 logger.info("Adding args.software='%s' as type ...", args.software)
490 types.append(args.software)
logger.info("Fetching table data for %d software type(s) ...", len(types))
493 for software in types:
494 logger.debug("software='%s' - BEFORE!", software)
495 if args.software is not None and args.software != software:
496 logger.debug("args.software='%s' does not match software='%s' - SKIPPED!", args.software, software)
501 logger.debug("Fetching table data for software='%s' ...", software)
502 raw = utils.fetch_url(
503 f"https://{source_domain}/app/views/tabledata.php?software={software}",
505 (config.get("connection_timeout"), config.get("read_timeout"))
507 logger.debug("raw[%s]()=%d", type(raw), len(raw))
509 doc = bs4.BeautifulSoup(raw, features="html.parser")
510 logger.debug("doc[]='%s'", type(doc))
511 except network.exceptions as exception:
512 logger.warning("Cannot fetch software='%s' from source_domain='%s': '%s'", software, source_domain, type(exception))
515 items = doc.findAll("a", {"class": "url"})
516 logger.info("Checking %d items,software='%s' ...", len(items), software)
518 logger.debug("item[]='%s'", type(item))
519 domain = item.decode_contents()
520 logger.debug("domain='%s' - AFTER!", domain)
523 logger.debug("domain is empty - SKIPPED!")
526 logger.debug("domain='%s' - BEFORE!", domain)
527 domain = domain.encode("idna").decode("utf-8")
528 logger.debug("domain='%s' - AFTER!", domain)
530 if not utils.is_domain_wanted(domain):
531 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
533 elif instances.is_registered(domain):
534 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
536 elif instances.is_recent(domain):
logger.debug("domain='%s' has recently been handled - SKIPPED!", domain)
540 software = software_helper.alias(software)
541 logger.info("Fetching instances for domain='%s'", domain)
542 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
544 logger.debug("Success! - EXIT!")
547 def fetch_todon_wiki(args: argparse.Namespace) -> int:
548 logger.debug("args[]='%s' - CALLED!", type(args))
550 logger.debug("Invoking locking.acquire() ...")
553 source_domain = "wiki.todon.eu"
554 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
558 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
559 sources.update(source_domain)
566 raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
567 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
569 doc = bs4.BeautifulSoup(raw, "html.parser")
570 logger.debug("doc[]='%s'", type(doc))
572 silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
573 logger.info("Checking %d silenced/limited entries ...", len(silenced))
574 blocklist["silenced"] = utils.find_domains(silenced, "div")
576 suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
577 logger.info("Checking %d suspended entries ...", len(suspended))
578 blocklist["reject"] = utils.find_domains(suspended, "div")
580 blocking = blocklist["silenced"] + blocklist["reject"]
583 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
584 instances.set_total_blocks(blocker, blocking)
587 for block_level in blocklist:
588 blockers = blocklist[block_level]
logger.debug("block_level='%s',blockers()=%d", block_level, len(blockers))
591 for blocked in blockers:
592 logger.debug("blocked='%s'", blocked)
594 if not instances.is_registered(blocked):
596 logger.info("Fetching instances from domain='%s' ...", blocked)
597 federation.fetch_instances(blocked, blocker, None, inspect.currentframe().f_code.co_name)
598 except network.exceptions as exception:
logger.warning("Exception '%s' during fetching instances (fetch_todon_wiki) from blocked='%s'", type(exception), blocked)
600 instances.set_last_error(blocked, exception)
602 if blocks.is_instance_blocked(blocker, blocked, block_level):
603 logger.debug("blocked='%s',block_level='%s' is already blocked - SKIPPED!", blocked, block_level)
606 logger.info("Adding new block: blocked='%s',block_level='%s'", blocked, block_level)
607 if processing.block(blocker, blocked, None, block_level) and block_level == "reject" and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',block_level='%s' for blocker='%s' ...", blocked, block_level, blocker)
614 logger.debug("Invoking commit() ...")
615 database.connection.commit()
617 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
618 if config.get("bot_enabled") and len(blockdict) > 0:
619 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
620 network.send_bot_post(blocker, blockdict)
622 logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
623 if instances.has_pending(blocker):
624 logger.debug("Flushing updates for blocker='%s' ...", blocker)
625 instances.update_data(blocker)
627 logger.debug("Success! - EXIT!")
def fetch_cs(args: argparse.Namespace) -> int:
631 logger.debug("args[]='%s' - CALLED!", type(args))
633 logger.debug("Invoking locking.acquire() ...")
661 source_domain = "raw.githubusercontent.com"
662 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
666 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
667 sources.update(source_domain)
669 raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
670 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
672 doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
673 logger.debug("doc()=%d[]='%s'", len(doc), type(doc))
675 silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody")
676 logger.debug("silenced[%s]()=%d", type(silenced), len(silenced))
677 blocklist["silenced"] = federation.find_domains(silenced)
679 blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody")
680 logger.debug("blocked[%s]()=%d", type(blocked), len(blocked))
681 blocklist["reject"] = federation.find_domains(blocked)
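# federation.find_domains() is expected to return a list of dicts shaped like
# [{"domain": "example.com", "reason": "..."}] (assumption inferred from the keys
# accessed in the loop below).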
683 blocking = blocklist["silenced"] + blocklist["reject"]
684 blocker = "chaos.social"
686 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", blocker, len(blocking))
687 instances.set_total_blocks(blocker, blocking)
689 logger.debug("blocklist[silenced]()=%d,blocklist[reject]()=%d", len(blocklist["silenced"]), len(blocklist["reject"]))
690 if len(blocking) > 0:
692 for block_level in blocklist:
693 logger.info("block_level='%s' has %d row(s)", block_level, len(blocklist[block_level]))
695 for row in blocklist[block_level]:
696 logger.debug("row[%s]='%s'", type(row), row)
if "domain" not in row:
698 logger.warning("row[]='%s' has no element 'domain' - SKIPPED!", type(row))
700 elif instances.is_recent(row["domain"], "last_blocked"):
701 logger.debug("row[domain]='%s' has been recently crawled - SKIPPED!", row["domain"])
703 elif not instances.is_registered(row["domain"]):
705 logger.info("Fetching instances from domain='%s' ...", row["domain"])
706 federation.fetch_instances(row["domain"], blocker, None, inspect.currentframe().f_code.co_name)
707 except network.exceptions as exception:
708 logger.warning("Exception '%s' during fetching instances (fetch_cs) from row[domain]='%s'", type(exception), row["domain"])
709 instances.set_last_error(row["domain"], exception)
711 if processing.block(blocker, row["domain"], row["reason"], block_level) and block_level == "reject" and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", row["domain"], row["reason"], blocker)
714 "blocked": row["domain"],
715 "reason" : row["reason"],
718 logger.debug("Invoking commit() ...")
719 database.connection.commit()
721 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
722 if config.get("bot_enabled") and len(blockdict) > 0:
723 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
724 network.send_bot_post(blocker, blockdict)
726 logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
727 if instances.has_pending(blocker):
728 logger.debug("Flushing updates for blocker='%s' ...", blocker)
729 instances.update_data(blocker)
731 logger.debug("Success! - EXIT!")
734 def fetch_fba_rss(args: argparse.Namespace) -> int:
735 logger.debug("args[]='%s' - CALLED!", type(args))
739 logger.debug("Invoking locking.acquire() ...")
742 components = urlparse(args.feed)
744 if sources.is_recent(components.netloc):
logger.info("API from components.netloc='%s' has recently been accessed - EXIT!", components.netloc)
748 logger.debug("components.netloc='%s' has not been recently used, marking ...", components.netloc)
749 sources.update(components.netloc)
751 logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
752 response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
754 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
755 if response.ok and response.status_code < 300 and len(response.text) > 0:
756 logger.debug("Parsing RSS feed (%d Bytes) ...", len(response.text))
757 rss = atoma.parse_rss_bytes(response.content)
759 logger.debug("rss[]='%s'", type(rss))
760 for item in rss.items:
761 logger.debug("item='%s'", item)
762 domain = tidyup.domain(item.link.split("=")[1])
764 logger.debug("domain='%s' - AFTER!", domain)
766 logger.debug("domain is empty - SKIPPED!")
769 logger.debug("domain='%s' - BEFORE!", domain)
770 domain = domain.encode("idna").decode("utf-8")
771 logger.debug("domain='%s' - AFTER!", domain)
773 if not utils.is_domain_wanted(domain):
774 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
776 elif domain in domains:
777 logger.debug("domain='%s' is already added - SKIPPED!", domain)
779 elif instances.is_registered(domain):
780 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
782 elif instances.is_recent(domain):
783 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
786 logger.debug("Adding domain='%s'", domain)
787 domains.append(domain)
789 logger.debug("domains()=%d", len(domains))
791 logger.info("Adding %d new instances ...", len(domains))
792 for domain in domains:
793 logger.debug("domain='%s'", domain)
795 logger.info("Fetching instances from domain='%s' ...", domain)
796 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
797 except network.exceptions as exception:
798 logger.warning("Exception '%s' during fetching instances (fetch_fba_rss) from domain='%s'", type(exception), domain)
799 instances.set_last_error(domain, exception)
802 logger.debug("Success! - EXIT!")
805 def fetch_fbabot_atom(args: argparse.Namespace) -> int:
806 logger.debug("args[]='%s' - CALLED!", type(args))
808 logger.debug("Invoking locking.acquire() ...")
811 source_domain = "ryona.agency"
812 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
816 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
817 sources.update(source_domain)
819 feed = f"https://{source_domain}/users/fba/feed.atom"
823 logger.info("Fetching ATOM feed='%s' from FBA bot account ...", feed)
824 response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
826 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
827 if response.ok and response.status_code < 300 and len(response.text) > 0:
828 logger.debug("Parsing ATOM feed (%d Bytes) ...", len(response.text))
829 atom = atoma.parse_atom_bytes(response.content)
831 logger.debug("atom[]='%s'", type(atom))
832 for entry in atom.entries:
833 logger.debug("entry[]='%s'", type(entry))
834 doc = bs4.BeautifulSoup(entry.content.value, "html.parser")
835 logger.debug("doc[]='%s'", type(doc))
836 for element in doc.findAll("a"):
837 logger.debug("element[]='%s'", type(element))
838 for href in element["href"].split(","):
839 logger.debug("href[%s]='%s' - BEFORE!", type(href), href)
840 domain = tidyup.domain(href)
842 logger.debug("domain='%s' - AFTER!", domain)
844 logger.debug("domain is empty - SKIPPED!")
847 logger.debug("domain='%s' - BEFORE!", domain)
848 domain = domain.encode("idna").decode("utf-8")
849 logger.debug("domain='%s' - AFTER!", domain)
851 if not utils.is_domain_wanted(domain):
852 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
854 elif domain in domains:
855 logger.debug("domain='%s' is already added - SKIPPED!", domain)
857 elif instances.is_registered(domain):
858 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
860 elif instances.is_recent(domain):
861 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
864 logger.debug("Adding domain='%s',domains()=%d", domain, len(domains))
865 domains.append(domain)
867 logger.debug("domains()=%d", len(domains))
869 logger.info("Adding %d new instances ...", len(domains))
870 for domain in domains:
871 logger.debug("domain='%s'", domain)
873 logger.info("Fetching instances from domain='%s' ...", domain)
874 federation.fetch_instances(domain, source_domain, None, inspect.currentframe().f_code.co_name)
875 except network.exceptions as exception:
876 logger.warning("Exception '%s' during fetching instances (fetch_fbabot_atom) from domain='%s'", type(exception), domain)
877 instances.set_last_error(domain, exception)
880 logger.debug("Success! - EXIT!")
883 def fetch_instances(args: argparse.Namespace) -> int:
884 logger.debug("args[]='%s' - CALLED!", type(args))
886 logger.debug("args.domain='%s' - checking ...", args.domain)
887 if not validators.domain(args.domain):
888 logger.warning("args.domain='%s' is not valid.", args.domain)
890 elif blacklist.is_blacklisted(args.domain):
891 logger.warning("args.domain='%s' is blacklisted, won't check it!", args.domain)
894 logger.debug("Invoking locking.acquire() ...")
899 logger.info("Fetching instances from args.domain='%s' ...", args.domain)
900 federation.fetch_instances(args.domain, None, None, inspect.currentframe().f_code.co_name)
901 except network.exceptions as exception:
902 logger.warning("Exception '%s' during fetching instances (fetch_instances) from args.domain='%s'", type(exception), args.domain)
903 instances.set_last_error(args.domain, exception)
904 instances.update_data(args.domain)
908 logger.debug("Not fetching more instances - EXIT!")
911 # Loop through some instances
912 database.cursor.execute(
913 "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube', 'takahe') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
916 rows = database.cursor.fetchall()
917 logger.info("Checking %d entries ...", len(rows))
919 logger.debug("row[domain]='%s'", row["domain"])
920 if row["domain"] == "":
921 logger.debug("row[domain] is empty - SKIPPED!")
924 logger.debug("row[domain]='%s' - BEFORE!", row["domain"])
925 domain = row["domain"].encode("idna").decode("utf-8")
926 logger.debug("domain='%s' - AFTER!", domain)
928 if not utils.is_domain_wanted(domain):
logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
933 logger.info("Fetching instances for domain='%s',origin='%s',software='%s',nodeinfo_url='%s'", domain, row["origin"], row["software"], row["nodeinfo_url"])
934 federation.fetch_instances(domain, row["origin"], row["software"], inspect.currentframe().f_code.co_name, row["nodeinfo_url"])
935 except network.exceptions as exception:
936 logger.warning("Exception '%s' during fetching instances (fetch_instances) from domain='%s'", type(exception), domain)
937 instances.set_last_error(domain, exception)
939 logger.debug("Success - EXIT!")
942 def fetch_oliphant(args: argparse.Namespace) -> int:
943 logger.debug("args[]='%s' - CALLED!", type(args))
945 logger.debug("Invoking locking.acquire() ...")
948 source_domain = "codeberg.org"
949 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
953 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
954 sources.update(source_domain)
957 base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
962 "blocker": "artisan.chat",
963 "csv_url": "mastodon/artisan.chat.csv",
965 "blocker": "mastodon.art",
966 "csv_url": "mastodon/mastodon.art.csv",
968 "blocker": "pleroma.envs.net",
969 "csv_url": "mastodon/pleroma.envs.net.csv",
971 "blocker": "oliphant.social",
972 "csv_url": "mastodon/_unified_tier3_blocklist.csv",
974 "blocker": "mastodon.online",
975 "csv_url": "mastodon/mastodon.online.csv",
977 "blocker": "mastodon.social",
978 "csv_url": "mastodon/mastodon.social.csv",
980 "blocker": "mastodon.social",
981 "csv_url": "other/missing-tier0-mastodon.social.csv",
983 "blocker": "rage.love",
984 "csv_url": "mastodon/rage.love.csv",
986 "blocker": "sunny.garden",
987 "csv_url": "mastodon/sunny.garden.csv",
989 "blocker": "sunny.garden",
990 "csv_url": "mastodon/gardenfence.csv",
992 "blocker": "solarpunk.moe",
993 "csv_url": "mastodon/solarpunk.moe.csv",
995 "blocker": "toot.wales",
996 "csv_url": "mastodon/toot.wales.csv",
998 "blocker": "union.place",
999 "csv_url": "mastodon/union.place.csv",
1001 "blocker": "oliphant.social",
1002 "csv_url": "mastodon/birdsite.csv",
1008 logger.debug("Downloading %d files ...", len(blocklists))
1009 for block in blocklists:
# Is a domain given that does not match this blocker?
1011 if isinstance(args.domain, str) and args.domain != block["blocker"]:
1012 logger.debug("Skipping blocker='%s', not matching args.domain='%s'", block["blocker"], args.domain)
1014 elif args.domain in domains:
1015 logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
1019 logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
1020 response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1022 logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
if not response.ok or response.status_code >= 300 or len(response.content) == 0:
1024 logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
1027 logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
1028 reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
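# The oliphant CSV files come in two header flavours, both handled below
# (illustrative assumption of the column names, derived from the checks that follow):
#   #domain,#severity,#reject_media,#reject_reports,...
#   domain,severity,reject_media,reject_reports,...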
1034 logger.debug("row[%s]='%s'", type(row), row)
1035 domain = severity = None
1036 reject_media = reject_reports = False
1038 if "#domain" in row:
1039 domain = row["#domain"]
1040 elif "domain" in row:
1041 domain = row["domain"]
1043 logger.debug("row='%s' does not contain domain column", row)
1046 if "#severity" in row:
1047 severity = blocks.alias_block_level(row["#severity"])
1048 elif "severity" in row:
1049 severity = blocks.alias_block_level(row["severity"])
1051 logger.debug("row='%s' does not contain severity column", row)
1054 if "#reject_media" in row and row["#reject_media"].lower() == "true":
1056 elif "reject_media" in row and row["reject_media"].lower() == "true":
1059 if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
1060 reject_reports = True
1061 elif "reject_reports" in row and row["reject_reports"].lower() == "true":
1062 reject_reports = True
1065 logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
1067 logger.debug("domain is empty - SKIPPED!")
1069 elif domain.endswith(".onion"):
1070 logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
1072 elif domain.endswith(".arpa"):
1073 logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
1075 elif domain.endswith(".tld"):
1076 logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
1078 elif domain.find("*") >= 0 or domain.find("?") >= 0:
1079 logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
1080 domain = utils.deobfuscate(domain, block["blocker"])
1081 logger.debug("domain='%s' - AFTER!", domain)
1083 if not validators.domain(domain):
logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
1086 elif blacklist.is_blacklisted(domain):
1087 logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
1090 logger.debug("Marking domain='%s' as handled", domain)
1091 domains.append(domain)
1093 logger.debug("Processing domain='%s' ...", domain)
1094 processed = processing.domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
1095 logger.debug("processed='%s'", processed)
1097 if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',severity='%s' for blocker='%s' ...", domain, severity, block["blocker"])
"reason" : None,
1105 processing.block(block["blocker"], domain, None, "reject_media")
1107 processing.block(block["blocker"], domain, None, "reject_reports")
1109 logger.debug("block[blocker]='%s'", block["blocker"])
1110 if block["blocker"] != "chaos.social":
1111 logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
1112 instances.set_total_blocks(block["blocker"], domains)
1114 logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
1115 if instances.has_pending(block["blocker"]):
1116 logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
1117 instances.update_data(block["blocker"])
1119 logger.debug("Invoking commit() ...")
1120 database.connection.commit()
1122 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1123 if config.get("bot_enabled") and len(blockdict) > 0:
1124 logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
1125 network.send_bot_post(block["blocker"], blockdict)
1127 logger.debug("Success! - EXIT!")
1130 def fetch_txt(args: argparse.Namespace) -> int:
1131 logger.debug("args[]='%s' - CALLED!", type(args))
1133 logger.debug("Invoking locking.acquire() ...")
1138 "blocker": "seirdy.one",
1139 "url" : "https://seirdy.one/pb/bsl.txt",
1142 logger.info("Checking %d text file(s) ...", len(urls))
1144 logger.debug("Fetching row[url]='%s' ...", row["url"])
1145 response = utils.fetch_url(row["url"], network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
1147 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1148 if response.ok and response.status_code < 300 and response.text != "":
1149 logger.debug("Returned %d Bytes for processing", len(response.text.strip()))
1150 domains = response.text.split("\n")
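# Assumption: the fetched block list is a plain text file with one domain per line;
# each line is tidied up and validated individually below.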
1152 logger.info("Processing %d domains ...", len(domains))
1153 for domain in domains:
1154 logger.debug("domain='%s' - BEFORE!", domain)
1155 domain = tidyup.domain(domain)
1157 logger.debug("domain='%s' - AFTER!", domain)
1159 logger.debug("domain is empty - SKIPPED!")
1161 elif not utils.is_domain_wanted(domain):
1162 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1164 elif instances.is_recent(domain):
1165 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1168 logger.debug("Processing domain='%s',row[blocker]='%s'", domain, row["blocker"])
1169 processed = processing.domain(domain, row["blocker"], inspect.currentframe().f_code.co_name)
1171 logger.debug("processed='%s'", processed)
1173 logger.debug("domain='%s' was not generically processed - SKIPPED!", domain)
1176 logger.debug("Success! - EXIT!")
1179 def fetch_fedipact(args: argparse.Namespace) -> int:
1180 logger.debug("args[]='%s' - CALLED!", type(args))
1182 logger.debug("Invoking locking.acquire() ...")
1185 source_domain = "fedipact.online"
1186 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1190 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1191 sources.update(source_domain)
1193 response = utils.fetch_url(
1194 f"https://{source_domain}",
1195 network.web_headers,
1196 (config.get("connection_timeout"), config.get("read_timeout"))
1199 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
1200 if response.ok and response.status_code < 300 and response.text != "":
1201 logger.debug("Parsing %d Bytes ...", len(response.text))
1203 doc = bs4.BeautifulSoup(response.text, "html.parser")
1204 logger.debug("doc[]='%s'", type(doc))
1206 rows = doc.findAll("li")
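# Assumption: each <li> on the fedipact.online page contains the bare instance
# domain as its first text node, e.g. <li>example.social</li>, which is why
# row.contents[0] is passed to tidyup.domain() below.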
1207 logger.info("Checking %d row(s) ...", len(rows))
1209 logger.debug("row[]='%s'", type(row))
1210 domain = tidyup.domain(row.contents[0])
1212 logger.debug("domain='%s' - AFTER!", domain)
1214 logger.debug("domain is empty - SKIPPED!")
1217 logger.debug("domain='%s' - BEFORE!", domain)
1218 domain = domain.encode("idna").decode("utf-8")
1219 logger.debug("domain='%s' - AFTER!", domain)
1221 if not utils.is_domain_wanted(domain):
1222 logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
1224 elif instances.is_registered(domain):
1225 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
1227 elif instances.is_recent(domain):
1228 logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
1231 logger.info("Fetching domain='%s' ...", domain)
1232 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
1234 logger.debug("Success! - EXIT!")
1237 def fetch_joinfediverse(args: argparse.Namespace) -> int:
1238 logger.debug("args[]='%s' - CALLED!", type(args))
1240 logger.debug("Invoking locking.acquire() ...")
1243 source_domain = "joinfediverse.wiki"
1244 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1248 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1249 sources.update(source_domain)
1251 raw = utils.fetch_url(
1252 f"https://{source_domain}/FediBlock",
1253 network.web_headers,
1254 (config.get("connection_timeout"), config.get("read_timeout"))
1256 logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
1258 doc = bs4.BeautifulSoup(raw, "html.parser")
1259 logger.debug("doc[]='%s'", type(doc))
1261 tables = doc.findAll("table", {"class": "wikitable"})
1263 logger.info("Analyzing %d table(s) ...", len(tables))
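# The FediBlock wiki tables are expected to look roughly like this (illustrative
# assumption, column names taken from the recognised headers below):
#   | domain      | subdomain(s) | block reason(s) |
#   | example.com | www/social   | spam            |
# block_headers maps a column index to one of those recognised header names so that
# data rows can be keyed by column position.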
1265 for table in tables:
1266 logger.debug("table[]='%s'", type(table))
1268 rows = table.findAll("tr")
1269 logger.info("Checking %d row(s) ...", len(rows))
1270 block_headers = dict()
1272 logger.debug("row[%s]='%s'", type(row), row)
1274 headers = row.findAll("th")
1275 logger.debug("Found headers()=%d header(s)", len(headers))
1276 if len(headers) > 1:
1277 block_headers = dict()
1279 for header in headers:
1281 logger.debug("header[]='%s',cnt=%d", type(header), cnt)
1282 text = header.contents[0]
1284 logger.debug("text[]='%s'", type(text))
1285 if not isinstance(text, str):
1286 logger.debug("text[]='%s' is not of type 'str' - SKIPPED!", type(text))
1288 elif validators.domain(text.strip()):
1289 logger.debug("text='%s' is a domain - SKIPPED!", text.strip())
1292 text = tidyup.domain(text.strip())
1293 logger.debug("text='%s' - AFTER!", text)
1294 if text in ["domain", "instance", "subdomain(s)", "block reason(s)"]:
1295 logger.debug("Found header: '%s'=%d", text, cnt)
1296 block_headers[cnt] = text
1298 elif len(block_headers) == 0:
1299 logger.debug("row is not scrapable - SKIPPED!")
1301 elif len(block_headers) > 0:
1302 logger.debug("Found a row with %d scrapable headers ...", len(block_headers))
1306 for element in row.find_all(["th", "td"]):
1308 logger.debug("element[]='%s',cnt=%d", type(element), cnt)
1309 if cnt in block_headers:
1310 logger.debug("block_headers[%d]='%s'", cnt, block_headers[cnt])
1312 text = element.text.strip()
1313 key = block_headers[cnt] if block_headers[cnt] not in ["domain", "instance"] else "blocked"
1315 logger.debug("cnt=%d is wanted: key='%s',text[%s]='%s'", cnt, key, type(text), text)
1316 if key in ["domain", "instance"]:
1318 elif key == "reason":
1319 block[key] = tidyup.reason(text)
1320 elif key == "subdomain(s)":
1323 block[key] = text.split("/")
1325 logger.debug("key='%s'", key)
1328 logger.debug("block()=%d ...", len(block))
1330 logger.debug("Appending block()=%d ...", len(block))
1331 blocklist.append(block)
1333 logger.debug("blocklist()=%d", len(blocklist))
1335 database.cursor.execute("SELECT domain FROM instances WHERE domain LIKE 'climatejustice.%'")
1336 domains = database.cursor.fetchall()
1338 logger.debug("domains(%d)[]='%s'", len(domains), type(domains))
1340 for block in blocklist:
1341 logger.debug("block='%s'", block)
1342 if "subdomain(s)" in block and len(block["subdomain(s)"]) > 0:
1343 origin = block["blocked"]
1344 logger.debug("origin='%s'", origin)
1345 for subdomain in block["subdomain(s)"]:
# Append a copy per subdomain so each list entry keeps its own "blocked" value
subdomain_block = dict(block)
subdomain_block["blocked"] = subdomain + "." + origin
logger.debug("subdomain_block[blocked]='%s'", subdomain_block["blocked"])
blocking.append(subdomain_block)
1350 blocking.append(block)
logger.debug("blocking()=%d", len(blocking))
1353 for block in blocking:
1354 logger.debug("block[]='%s'", type(block))
1355 if "blocked" not in block:
1356 raise KeyError(f"block()={len(block)} does not have element 'blocked'")
1358 block["blocked"] = tidyup.domain(block["blocked"]).encode("idna").decode("utf-8")
1359 logger.debug("block[blocked]='%s' - AFTER!", block["blocked"])
1361 if block["blocked"] == "":
1362 logger.debug("block[blocked] is empty - SKIPPED!")
1364 elif not utils.is_domain_wanted(block["blocked"]):
1365 logger.warning("block[blocked]='%s' is not wanted - SKIPPED!", block["blocked"])
1367 elif instances.is_recent(block["blocked"]):
1368 logger.debug("block[blocked]='%s' has been recently checked - SKIPPED!", block["blocked"])
logger.info("Processing blocked='%s' ...", block["blocked"])
1372 processing.domain(block["blocked"], "climatejustice.social", inspect.currentframe().f_code.co_name)
1375 for blocker in domains:
1376 blocker = blocker[0]
1377 logger.debug("blocker[%s]='%s'", type(blocker), blocker)
1379 for block in blocking:
1380 logger.debug("block[blocked]='%s',block[block reason(s)]='%s' - BEFORE!", block["blocked"], block["block reason(s)"] if "block reason(s)" in block else None)
1381 block["reason"] = tidyup.reason(block["block reason(s)"]) if "block reason(s)" in block else None
1383 logger.debug("block[blocked]='%s',block[reason]='%s' - AFTER!", block["blocked"], block["reason"])
1384 if block["blocked"] == "":
1385 logger.debug("block[blocked] is empty - SKIPPED!")
1387 elif not utils.is_domain_wanted(block["blocked"]):
1388 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1391 logger.debug("blocked='%s',reason='%s'", block["blocked"], block["reason"])
1392 if processing.block(blocker, block["blocked"], block["reason"], "reject") and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], blocker)
1395 "blocked": block["blocked"],
1396 "reason" : block["reason"],
1399 if instances.has_pending(blocker):
1400 logger.debug("Flushing updates for blocker='%s' ...", blocker)
1401 instances.update_data(blocker)
1403 logger.debug("Invoking commit() ...")
1404 database.connection.commit()
1406 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1407 if config.get("bot_enabled") and len(blockdict) > 0:
logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
1409 network.send_bot_post(blocker, blockdict)
1411 logger.debug("Success! - EXIT!")
1414 def recheck_obfuscation(args: argparse.Namespace) -> int:
1415 logger.debug("args[]='%s' - CALLED!", type(args))
1417 logger.debug("Invoking locking.acquire() ...")
1420 if isinstance(args.domain, str) and args.domain != "" and utils.is_domain_wanted(args.domain):
1421 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND domain = ?", [args.domain])
1422 elif isinstance(args.software, str) and args.software != "" and validators.domain(args.software) == args.software:
1423 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1 AND software = ?", [args.software])
1425 database.cursor.execute("SELECT domain, software, nodeinfo_url FROM instances WHERE has_obfuscation = 1")
1427 rows = database.cursor.fetchall()
1428 logger.info("Checking %d domains ...", len(rows))
1430 logger.debug("Fetching peers from domain='%s',software='%s',nodeinfo_url='%s' ...", row["domain"], row["software"], row["nodeinfo_url"])
1431 if (args.all is None or not args.all) and instances.is_recent(row["domain"]) and args.domain is None and args.software is None:
1432 logger.debug("row[domain]='%s' has been recently checked, args.all[]='%s' - SKIPPED!", row["domain"], type(args.all))
1436 if row["software"] == "pleroma":
1437 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1438 blocking = pleroma.fetch_blocks(row["domain"], row["nodeinfo_url"])
1439 elif row["software"] == "mastodon":
1440 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1441 blocking = mastodon.fetch_blocks(row["domain"], row["nodeinfo_url"])
1442 elif row["software"] == "lemmy":
1443 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1444 blocking = lemmy.fetch_blocks(row["domain"], row["nodeinfo_url"])
1445 elif row["software"] == "friendica":
1446 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1447 blocking = friendica.fetch_blocks(row["domain"])
1448 elif row["software"] == "misskey":
1449 logger.debug("domain='%s',software='%s'", row["domain"], row["software"])
1450 blocking = misskey.fetch_blocks(row["domain"])
logger.warning("Unknown software: domain='%s',software='%s'", row["domain"], row["software"])
1454 logger.debug("row[domain]='%s'", row["domain"])
1455 if row["domain"] != "chaos.social":
1456 logger.debug("Invoking instances.set_total_blocks(%s, %d) ...", row["domain"], len(blocking))
1457 instances.set_total_blocks(row["domain"], blocking)
1462 logger.info("Checking %d block(s) from domain='%s' ...", len(blocking), row["domain"])
1463 for block in blocking:
1464 logger.debug("block[blocked]='%s'", block["blocked"])
1467 if block["blocked"] == "":
1468 logger.debug("block[blocked] is empty - SKIPPED!")
1470 elif block["blocked"].endswith(".arpa"):
1471 logger.debug("blocked='%s' is a reversed IP address - SKIPPED!", block["blocked"])
1473 elif block["blocked"].endswith(".tld"):
1474 logger.debug("blocked='%s' is a fake domain name - SKIPPED!", block["blocked"])
1476 elif block["blocked"].endswith(".onion"):
1477 logger.debug("blocked='%s' is a TOR onion domain name - SKIPPED!", block["blocked"])
1479 elif block["blocked"].find("*") >= 0 or block["blocked"].find("?") >= 0:
logger.debug("blocked='%s' is obfuscated", block["blocked"])
1481 obfuscated = obfuscated + 1
1482 blocked = utils.deobfuscate(block["blocked"], row["domain"], block["hash"] if "hash" in block else None)
1483 elif not utils.is_domain_wanted(block["blocked"]):
1484 logger.warning("blocked='%s' is not wanted - SKIPPED!", block["blocked"])
1486 elif blocks.is_instance_blocked(row["domain"], block["blocked"]):
1487 logger.debug("blocked='%s' is already blocked - SKIPPED!", block["blocked"])
1490 logger.debug("blocked[%s]='%s',block[blocked]='%s'", type(blocked), blocked, block["blocked"])
1491 if blocked is not None and blocked != block["blocked"]:
1492 logger.debug("blocked='%s' was deobfuscated to blocked='%s'", block["blocked"], blocked)
1493 obfuscated = obfuscated - 1
1494 if blocks.is_instance_blocked(row["domain"], blocked):
1495 logger.debug("blocked='%s' is already blocked by domain='%s' - SKIPPED!", blocked, row["domain"])
1498 block["block_level"] = blocks.alias_block_level(block["block_level"])
1500 logger.info("blocked='%s' has been deobfuscated to blocked='%s', adding ...", block["blocked"], blocked)
1501 if processing.block(row["domain"], blocked, block["reason"], block["block_level"]) and block["block_level"] == "reject" and config.get("bot_enabled"):
logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", block["blocked"], block["reason"], row["domain"])
1505 "reason" : block["reason"],
1508 logger.info("domain='%s' has %d obfuscated domain(s)", row["domain"], obfuscated)
1509 if obfuscated == 0 and len(blocking) > 0:
1510 logger.info("Block list from domain='%s' has been fully deobfuscated.", row["domain"])
1511 instances.set_has_obfuscation(row["domain"], False)
1513 if instances.has_pending(row["domain"]):
1514 logger.debug("Flushing updates for blocker='%s' ...", row["domain"])
1515 instances.update_data(row["domain"])
1517 logger.debug("Invoking commit() ...")
1518 database.connection.commit()
1520 logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
1521 if config.get("bot_enabled") and len(blockdict) > 0:
logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", row["domain"], len(blockdict))
1523 network.send_bot_post(row["domain"], blockdict)
1525 logger.debug("Success! - EXIT!")
1528 def fetch_fedilist(args: argparse.Namespace) -> int:
1529 logger.debug("args[]='%s' - CALLED!", type(args))
1531 logger.debug("Invoking locking.acquire() ...")
1534 source_domain = "demo.fedilist.com"
1535 if sources.is_recent(source_domain):
logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
1539 logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
1540 sources.update(source_domain)
1542 url = f"http://{source_domain}/instance/csv?onion=not"
1543 if args.software is not None and args.software != "":
1544 logger.debug("args.software='%s'", args.software)
1545 url = f"http://{source_domain}/instance/csv?software={args.software}&onion=not"
1547 logger.info("Fetching url='%s' ...", url)
    response = reqto.get(
        url,
        headers=network.web_headers,
        timeout=(config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=False
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if not response.ok or response.status_code >= 300 or len(response.content) == 0:
        logger.warning("Failed fetching url='%s': response.ok='%s',response.status_code=%d,response.text()=%d - EXIT!", url, response.ok, response.status_code, len(response.text))
        return 1

    reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")

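    # Each CSV row is expected to provide at least a "hostname" column; rows
    # without it are skipped below, all other columns are ignored.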
    logger.debug("reader[]='%s'", type(reader))
    for row in reader:
        logger.debug("row[]='%s'", type(row))
        if "hostname" not in row:
            logger.warning("row()=%d has no element 'hostname' - SKIPPED!", len(row))
            continue

        logger.debug("row[hostname]='%s' - BEFORE!", row["hostname"])
        domain = tidyup.domain(row["hostname"])
        logger.debug("domain='%s' - AFTER!", domain)

        if domain == "":
            logger.debug("domain is empty after tidyup: row[hostname]='%s' - SKIPPED!", row["hostname"])
            continue

        logger.debug("domain='%s' - BEFORE!", domain)
        domain = domain.encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not utils.is_domain_wanted(domain):
            logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif (args.all is None or not args.all) and instances.is_registered(domain):
            logger.debug("domain='%s' is already registered, --all not specified: args.all[]='%s'", domain, type(args.all))
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
            continue

        logger.info("Fetching instances from domain='%s' ...", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0
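
# Re-detects the software type of known instances: a single domain via
# args.domain, all instances of one software via args.software, or otherwise
# every instance whose nodeinfo data is older than the "recheck_nodeinfo"
# interval from the configuration.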
def update_nodeinfo(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    if args.domain is not None and args.domain != "":
        logger.debug("Fetching args.domain='%s'", args.domain)
        database.cursor.execute("SELECT domain, software FROM instances WHERE domain = ?", [args.domain])
    elif args.software is not None and args.software != "":
        logger.info("Fetching domains for args.software='%s'", args.software)
        database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
    else:
        logger.info("Fetching domains with stale nodeinfo ...")
        database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_nodeinfo")])

    domains = database.cursor.fetchall()

    logger.info("Checking %d domain(s) ...", len(domains))
    cnt = 0
    for row in domains:
        logger.debug("row[]='%s'", type(row))
        try:
            logger.info("Checking nodeinfo for row[domain]='%s',row[software]='%s' (%s%%) ...", row["domain"], row["software"], "{:5.1f}".format(cnt / len(domains) * 100))
            software = federation.determine_software(row["domain"])

            logger.debug("Determined software='%s'", software)
            if (software != row["software"] and software is not None) or args.force is True:
                logger.warning("Software type for row[domain]='%s' has changed from '%s' to '%s'!", row["domain"], row["software"], software)
                instances.set_software(row["domain"], software)

            instances.set_success(row["domain"])
        except network.exceptions as exception:
            logger.warning("Exception '%s' during updating nodeinfo for row[domain]='%s'", type(exception), row["domain"])
            instances.set_last_error(row["domain"], exception)

        instances.set_last_nodeinfo(row["domain"])
        instances.update_data(row["domain"])
        cnt = cnt + 1

    logger.debug("Success! - EXIT!")
    return 0
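
# Pulls the full instance list from the instances.social API and registers any
# unknown domain. Requires an API token in config.json ("instances_social_api_key"),
# which is sent as a Bearer token.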
def fetch_instances_social(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    logger.debug("Invoking locking.acquire() ...")
    locking.acquire()

    source_domain = "instances.social"

    if config.get("instances_social_api_key") == "":
        logger.error("API key not set. Please set it in your config.json file.")
        return 1
    elif sources.is_recent(source_domain):
        logger.info("API from source_domain='%s' has recently been accessed - EXIT!", source_domain)
        return 0
    else:
        logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
        sources.update(source_domain)

    headers = {
        "Authorization": f"Bearer {config.get('instances_social_api_key')}",
    }

    fetched = network.get_json_api(
        source_domain,
        "/api/1.0/instances/list?count=0&sort_by=name",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )
    logger.debug("fetched[]='%s'", type(fetched))

    if "error_message" in fetched:
        logger.warning("Error during fetching API result: '%s' - EXIT!", fetched["error_message"])
        return 1
    elif "exception" in fetched:
        logger.warning("Exception '%s' during fetching API result - EXIT!", type(fetched["exception"]))
        return 1
    elif "json" not in fetched:
        logger.warning("fetched has no element 'json' - EXIT!")
        return 1
    elif "instances" not in fetched["json"]:
        logger.warning("fetched[json] has no element 'instances' - EXIT!")
        return 1

    domains = list()
    rows = fetched["json"]["instances"]

    logger.info("Checking %d row(s) ...", len(rows))
    for row in rows:
        logger.debug("row[]='%s'", type(row))
        domain = tidyup.domain(row["name"])
        logger.debug("domain='%s' - AFTER!", domain)

        if domain == "":
            logger.debug("domain is empty - SKIPPED!")
            continue

        logger.debug("domain='%s' - BEFORE!", domain)
        domain = domain.encode("idna").decode("utf-8")
        logger.debug("domain='%s' - AFTER!", domain)

        if not utils.is_domain_wanted(domain):
            logger.warning("domain='%s' is not wanted - SKIPPED!", domain)
            continue
        elif domain in domains:
            logger.debug("domain='%s' is already added - SKIPPED!", domain)
            continue
        elif instances.is_registered(domain):
            logger.debug("domain='%s' is already registered - SKIPPED!", domain)
            continue
        elif instances.is_recent(domain):
            logger.debug("domain='%s' has been recently crawled - SKIPPED!", domain)
            continue

        logger.info("Fetching instances from domain='%s'", domain)
        federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)

    logger.debug("Success! - EXIT!")
    return 0
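
# Maintenance command: rewrites all non-punycode domain columns in the
# instances and blocks tables into their IDNA/punycode form, e.g.
# "müller.social".encode("idna").decode("utf-8") -> "xn--mller-kva.social".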
def convert_idna(args: argparse.Namespace) -> int:
    logger.debug("args[]='%s' - CALLED!", type(args))

    database.cursor.execute("SELECT domain FROM instances WHERE domain NOT LIKE '%xn--%' ORDER BY domain ASC")
    rows = database.cursor.fetchall()

    logger.debug("rows[]='%s'", type(rows))
    instances.translate_idnas(rows, "domain")

    database.cursor.execute("SELECT origin FROM instances WHERE origin NOT LIKE '%xn--%' ORDER BY origin ASC")
    rows = database.cursor.fetchall()

    logger.debug("rows[]='%s'", type(rows))
    instances.translate_idnas(rows, "origin")

    database.cursor.execute("SELECT blocker FROM blocks WHERE blocker NOT LIKE '%xn--%' ORDER BY blocker ASC")
    rows = database.cursor.fetchall()

    logger.debug("rows[]='%s'", type(rows))
    blocks.translate_idnas(rows, "blocker")

    database.cursor.execute("SELECT blocked FROM blocks WHERE blocked NOT LIKE '%xn--%' ORDER BY blocked ASC")
    rows = database.cursor.fetchall()

    logger.debug("rows[]='%s'", type(rows))
    blocks.translate_idnas(rows, "blocked")

    logger.debug("Success! - EXIT!")
    return 0