From: Roland Häder
Date: Thu, 8 Jun 2023 23:58:53 +0000 (+0200)
Subject: Continued:
X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=f9fd5b3e477977f54cd28d1c5f118db348a9acdc;p=fba.git

Continued:
- added command 'fetch_federater' which fetches a CSV file from GitHub
- if software is null, attempt to determine it
- renamed get_url() to fetch_url()
---

diff --git a/fba/boot.py b/fba/boot.py
index 13a233d..e66690b 100644
--- a/fba/boot.py
+++ b/fba/boot.py
@@ -99,6 +99,13 @@ def init_parser():
     )
     parser.set_defaults(command=commands.fetch_fbabot_atom)
 
+    ### Fetch blocks from federater ###
+    parser = subparser_command.add_parser(
+        "fetch_federater",
+        help="Fetches CSV file (block recommendations) with more possible instances to discover",
+    )
+    parser.set_defaults(command=commands.fetch_federater)
+
     ### Fetch instances from given initial instance ###
     parser = subparser_command.add_parser(
         "fetch_instances",
diff --git a/fba/commands.py b/fba/commands.py
index 809ee21..c2bb791 100644
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -17,6 +17,7 @@
 import argparse
 import atoma
 import bs4
+import csv
 import inspect
 import itertools
 import json
@@ -308,7 +309,7 @@ def fetch_fba_rss(args: argparse.Namespace):
 
     try:
         print(f"INFO: Fetch FBA-specific RSS args.feed='{args.feed}' ...")
-        response = fba.get_url(args.feed, fba.headers, (config.get("connection_timeout"), config.get("read_timeout")))
+        response = fba.fetch_url(args.feed, fba.headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
         # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}")
         if response.ok and response.status_code < 300 and len(response.text) > 0:
@@ -355,7 +356,7 @@ def fetch_fbabot_atom(args: argparse.Namespace):
     domains = list()
     try:
         print(f"INFO: Fetching ATOM feed='{feed}' from FBA bot account ...")
-        response = fba.get_url(feed, fba.headers, (config.get("connection_timeout"), config.get("read_timeout")))
+        response = fba.fetch_url(feed, fba.headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
         # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}")
         if response.ok and response.status_code < 300 and len(response.text) > 0:
@@ -429,3 +430,31 @@ def fetch_instances(args: argparse.Namespace):
         fba.fetch_instances(row[0], row[1], row[2], inspect.currentframe().f_code.co_name, row[3])
 
     # DEBUG: print("DEBUG: EXIT!")
+
+def fetch_federater(args: argparse.Namespace):
+    # DEBUG: print(f"DEBUG: args[]={type(args)} - CALLED!")
+    boot.acquire_lock()
+
+    # Fetch the upstream block-recommendations CSV
+    response = fba.fetch_url("https://github.com/federater/blocks_recommended/raw/main/federater.csv", fba.headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    # DEBUG: print(f"DEBUG: response[]='{type(response)}'")
+    if response.ok and response.content != b"":
+        # DEBUG: print(f"DEBUG: Fetched {len(response.content)} bytes, parsing CSV ...")
+        # Columns: domain,severity,reject_media,reject_reports,public_comment,obfuscate
+        reader = csv.DictReader(response.content.decode('utf-8').splitlines(), dialect='unix')
+        # DEBUG: print(f"DEBUG: reader[]={type(reader)}")
+        for row in reader:
+            if not validators.domain(row["#domain"]):
+                print(f"WARNING: domain='{row['#domain']}' is not a valid domain - skipped!")
+                continue
+            elif blacklist.is_blacklisted(row["#domain"]):
+                print(f"WARNING: domain='{row['#domain']}' is blacklisted - skipped!")
+                continue
+            elif instances.is_registered(row["#domain"]):
+                # DEBUG: print(f"DEBUG: domain='{row['#domain']}' is already registered - skipped!")
+                continue
+
+            print(f"INFO: Fetching instances for instance='{row['#domain']}' ...")
+            fba.fetch_instances(row["#domain"], 'github.com', None, inspect.currentframe().f_code.co_name)
+
+    # DEBUG: print("DEBUG: EXIT!")
diff --git a/fba/fba.py b/fba/fba.py
index 1eddf5d..7868dd6 100644
--- a/fba/fba.py
+++ b/fba/fba.py
@@ -93,6 +93,12 @@ def fetch_instances(domain: str, origin: str, software: str, script: str, path:
         raise ValueError(f"Parameter 'domain' is empty")
     elif type(origin) != str and origin != None:
         raise ValueError(f"Parameter origin[]={type(origin)} is not 'str'")
+    elif software == None:
+        print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
+        software = determine_software(domain, path)
+        print(f"DEBUG: Determined software='{software}' for domain='{domain}'")
+    elif type(software) != str:
+        raise ValueError(f"Parameter software[]={type(software)} is not 'str'")
     elif type(script) != str:
         raise ValueError(f"Parameter script[]={type(script)} is not 'str'")
     elif domain == "":
@@ -521,7 +527,7 @@ def fetch_wellknown_nodeinfo(domain: str) -> list:
             # DEBUG: print("DEBUG: rel,href:", link["rel"], link["href"])
             if link["rel"] in nodeinfo_identifier:
                 # DEBUG: print("DEBUG: Fetching nodeinfo from:", link["href"])
-                response = get_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+                response = fetch_url(link["href"], api_headers, (config.get("connection_timeout"), config.get("read_timeout")))
                 data = json_from_response(response)
 
                 # DEBUG: print("DEBUG: href,response.ok,response.status_code:", link["href"], response.ok, response.status_code)
@@ -955,7 +961,7 @@ def find_domains(tag: bs4.element.Tag) -> list:
     # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
     return domains
 
-def get_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
+def fetch_url(url: str, headers: dict, timeout: list) -> requests.models.Response:
     # DEBUG: print(f"DEBUG: url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
     if type(url) != str:
         raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
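
Note: the new fetch_federater command boils down to the following standalone
pattern: download the CSV, feed its decoded lines to csv.DictReader, and read
the "#domain" column. This is a minimal sketch, not the project's code: it
uses plain requests instead of the fba.fetch_url() wrapper, and the timeout
values are placeholder assumptions rather than the configured
connection_timeout/read_timeout values.

    import csv
    import requests

    # Upstream block-recommendation list; its header row literally names the
    # first column "#domain".
    URL = "https://github.com/federater/blocks_recommended/raw/main/federater.csv"

    # (connect, read) timeouts are illustrative placeholders
    response = requests.get(URL, timeout=(5.0, 10.0))

    if response.ok and response.content != b"":
        # DictReader yields one dict per data row, keyed by the header row
        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
        for row in reader:
            print(row["#domain"])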