From 042e6bc3267ad1080e2b796ea162544feeeb8318 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Fri, 9 Jun 2023 05:32:23 +0200 Subject: [PATCH] Continued: - the repository is more up-to-date, let's fetch this instead - had to parse the markdown code back to HTML to be able to traverse with existing code over it --- fba/commands.py | 35 ++++++++++++++++++++++++++++------- fba/fba.py | 6 +++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fba/commands.py b/fba/commands.py index a14d878..1db0c08 100644 --- a/fba/commands.py +++ b/fba/commands.py @@ -21,6 +21,7 @@ import csv import inspect import itertools import json +import markdown import re import reqto import sys @@ -257,23 +258,43 @@ def fetch_blocks(args: argparse.Namespace): def fetch_cs(args: argparse.Namespace): # DEBUG: print(f"DEBUG: args[]={type(args)} - CALLED!") + extensions = [ + 'extra', + 'abbr', + 'attr_list', + 'def_list', + 'fenced_code', + 'footnotes', + 'md_in_html', + 'admonition', + 'codehilite', + 'legacy_attrs', + 'legacy_em', + 'meta', + 'nl2br', + 'sane_lists', + 'smarty', + 'toc', + 'wikilinks' + ] + domains = { "silenced": list(), "reject" : list(), } try: - doc = bs4.BeautifulSoup( - network.fetch_response("meta.chaos.social", "/federation", fba.headers, (config.get("connection_timeout"), config.get("read_timeout"))).text, - "html.parser", - ) - # DEBUG: print(f"DEBUG: doc()={len(doc)}[]={type(doc)}") - silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table") + raw = fba.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", fba.headers, (config.get("connection_timeout"), config.get("read_timeout"))).text + # DEBUG: print(f"DEBUG: raw()={len(raw)}[]={type(raw)}") + doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features='html.parser') + + # DEBUG: print(f"DEBUG: doc()={len(doc)}[]={type(doc)}") + silenced = doc.find("h2", {"id": "silenced-instances"}).findNext("table").find("tbody") # DEBUG: print(f"DEBUG: silenced[]={type(silenced)}") domains["silenced"] = domains["silenced"] + fba.find_domains(silenced) - blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table") + blocked = doc.find("h2", {"id": "blocked-instances"}).findNext("table").find("tbody") # DEBUG: print(f"DEBUG: blocked[]={type(blocked)}") domains["reject"] = domains["reject"] + fba.find_domains(blocked) diff --git a/fba/fba.py b/fba/fba.py index 1020416..7f0c140 100644 --- a/fba/fba.py +++ b/fba/fba.py @@ -807,7 +807,7 @@ def find_domains(tag: bs4.element.Tag) -> list: # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!") return domains -def fetch_url(url: str, headers: dict, timeout: list) -> requests.models.Response: +def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Response: # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!") if not isinstance(url, str): raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'") @@ -815,8 +815,8 @@ def fetch_url(url: str, headers: dict, timeout: list) -> requests.models.Respons raise ValueError("Parameter 'url' is empty") elif not isinstance(headers, dict): raise ValueError(f"Parameter headers[]='{type(headers)}' is not 'dict'") - elif not isinstance(timeout, list): - raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not 'list'") + elif not isinstance(timeout, tuple): + raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not 'tuple'") # DEBUG: print(f"DEBUG: Parsing url='{url}'") components = urlparse(url) -- 2.39.5