From e66aa094809d60fc6e784f1131015205332366ec Mon Sep 17 00:00:00 2001 From: =?utf8?q?Roland=20H=C3=A4der?= Date: Tue, 11 Jul 2023 15:21:38 +0200 Subject: [PATCH] Continued: - oops, header was wrong here due to previous changes (search for all headers) - but after a few renames, all is back in order! --- fba/networks/lemmy.py | 49 ++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py index 3d99838..0b22f53 100644 --- a/fba/networks/lemmy.py +++ b/fba/networks/lemmy.py @@ -145,17 +145,18 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list: logger.debug("doc[]='%s'", type(doc)) found = None - for container in [{"class": "home-instances container-lg"}, {"class": "container"}]: - logger.debug("container='%s'", container) - headers = doc.findAll("div", container) - - logger.debug("Checking %d header(s) ...", len(headers)) - for header in headers: - logger.debug("header[]='%s'", type(header)) - for content in header.find_all(["h2", "h3", "h4", "h5"]): - logger.debug("content[%s]='%s' - BEFORE!", type(content), content) - if content is not None: - content = str(content.contents[0]) + for criteria in [{"class": "home-instances container-lg"}, {"class": "container"}]: + logger.debug("criteria='%s'", criteria) + containers = doc.findAll("div", criteria) + + logger.debug("Checking %d containers ...", len(containers)) + for container in containers: + logger.debug("container[]='%s'", type(container)) + for header in container.find_all(["h2", "h3", "h4", "h5"]): + content = header + logger.debug("header[%s]='%s' - BEFORE!", type(header), header) + if header is not None: + content = str(header.contents[0]) logger.debug("content[%s]='%s' - AFTER!", type(content), content) if content is None: @@ -165,10 +166,20 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list: logger.debug("content[]='%s' is not supported/wanted type 'str' - SKIPPED!", type(content)) continue elif 
content.lower() in translations: - logger.debug("Found header with blocked instances - BREAK!") + logger.debug("Found header='%s' with blocked instances - BREAK(3) !", header) found = header break + logger.debug("found[]='%s'", type(found)) + if found is not None: + logger.debug("Found header with blocked instances - BREAK(2) !") + break + + logger.debug("found[]='%s'", type(found)) + if found is not None: + logger.debug("Found header with blocked instances - BREAK(1) !") + break + logger.debug("found[]='%s'", type(found)) if found is None: logger.info("domain='%s' has no HTML blocklist, checking scripts ...", domain) @@ -187,7 +198,7 @@ def fetch_blocks(domain: str, nodeinfo_url: str) -> list: logger.debug("blocklist()=%d - EXIT!", len(blocklist)) return blocklist - blocking = found.find_next(["ul","table"]).findAll("a") + blocking = found.find_next(["ul", "table"]).findAll("a") logger.debug("Found %d blocked instance(s) ...", len(blocking)) for tag in blocking: logger.debug("tag[]='%s'", type(tag)) @@ -239,16 +250,16 @@ def fetch_instances(domain: str, origin: str) -> list: doc = bs4.BeautifulSoup(response.text, "html.parser") logger.debug("doc[]='%s'", type(doc)) - for container in [{"class": "home-instances container-lg"}, {"class": "container"}]: - logger.debug("container='%s'", container) - headers = doc.findAll("div", container) + for criteria in [{"class": "home-instances container-lg"}, {"class": "container"}]: + logger.debug("criteria='%s'", criteria) + containers = doc.findAll("div", criteria) - logger.debug("Checking %d headers ...", len(headers)) - for header in headers: + logger.debug("Checking %d containers ...", len(containers)) + for header in containers: logger.debug("header[%s]='%s'", type(header), header) rows = header.find_next(["ul","table"]).findAll("a") - logger.debug("Found %d blocked instance(s) ...", len(rows)) + logger.debug("Found %d instance(s) ...", len(rows)) for tag in rows: logger.debug("tag[]='%s'", type(tag)) text = 
tag.contents[0] if isinstance(tag.contents[0], str) else tag.contents[0].text -- 2.39.5