]> git.mxchange.org Git - fba.git/commitdiff
Continued:
authorRoland Häder <roland@mxchange.org>
Tue, 12 Sep 2023 10:00:19 +0000 (12:00 +0200)
committerRoland Häder <roland@mxchange.org>
Tue, 12 Sep 2023 10:01:16 +0000 (12:01 +0200)
- added command 'fetch_csv' which fetches CSV files and processes them for
  further instance discovery and blocklist expansion
- introduced function processing.csv_block() which does the above processing
- return non-zero exit code when source was queried too recently

fba/boot.py
fba/commands.py
fba/helpers/blocklists.py
fba/helpers/processing.py

index 8e0f3fe5c92801e83279965383755bb6dc78f911..a2941ab9d641e8433a4494484d0aad45d8bd0945 100644 (file)
@@ -123,6 +123,13 @@ def init_parser():
     parser.set_defaults(command=commands.fetch_oliphant)
     parser.add_argument("--domain", help="Instance name (aka. domain) to check")
 
+    ### Fetch blocks from other CSV files
+    parser = subparser_command.add_parser(
+        "fetch_csv",
+        help="Fetches CSV files (block recommendations) for more possible instances to discover",
+    )
+    parser.set_defaults(command=commands.fetch_csv)
+
     ### Fetch instances from given initial instance ###
     parser = subparser_command.add_parser(
         "fetch_instances",
index 1d893062f0145bb236184649e08771e3233d3b70..1ceb234e45561095cb1dbac424ba37bb31fde89d 100644 (file)
@@ -112,7 +112,7 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int:
 
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -185,7 +185,7 @@ def fetch_bkali(args: argparse.Namespace) -> int:
     source_domain = "gql.api.bka.li"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -466,7 +466,7 @@ def fetch_observer(args: argparse.Namespace) -> int:
     source_domain = "fediverse.observer"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -568,7 +568,7 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
     source_domain = "wiki.todon.eu"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -688,7 +688,7 @@ def fetch_cs(args: argparse.Namespace):
     source_domain = "raw.githubusercontent.com"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -852,7 +852,7 @@ def fetch_fbabot_atom(args: argparse.Namespace) -> int:
 
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -993,6 +993,20 @@ def fetch_instances(args: argparse.Namespace) -> int:
     logger.debug("Success - EXIT!")
     return 0
 
+def fetch_csv(args: argparse.Namespace) -> int:
+    logger.debug("args[]='%s' - CALLED!", type(args))
+
+    logger.debug("Invoking locking.acquire() ...")
+    locking.acquire()
+
+    logger.info("Checking %d CSV files ...", len(blocklists.csv_files))
+    for block in blocklists.csv_files:
+        logger.debug("block[blocker]='%s',block[csv_url]='%s'", block["blocker"], block["csv_url"])
+        processing.csv_block(block["blocker"], block["csv_url"], inspect.currentframe().f_code.co_name)
+
+    logger.debug("Success - EXIT!")
+    return 0
+
 def fetch_oliphant(args: argparse.Namespace) -> int:
     logger.debug("args[]='%s' - CALLED!", type(args))
 
@@ -1002,7 +1016,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int:
     source_domain = "codeberg.org"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1010,8 +1024,6 @@ def fetch_oliphant(args: argparse.Namespace) -> int:
     # Base URL
     base_url = f"https://{source_domain}/oliphant/blocklists/raw/branch/main/blocklists"
 
-    domains = list()
-
     logger.debug("Downloading %d files ...", len(blocklists.oliphant_blocklists))
     for block in blocklists.oliphant_blocklists:
         # Is domain given and not equal blocker?
@@ -1022,119 +1034,7 @@ def fetch_oliphant(args: argparse.Namespace) -> int:
             logger.debug("args.domain='%s' already handled - SKIPPED!", args.domain)
             continue
 
-        instances.set_last_blocked(block["blocker"])
-
-        # Fetch this URL
-        logger.info("Fetching csv_url='%s' for blocker='%s' ...", block["csv_url"], block["blocker"])
-        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
-
-        logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
-        if not response.ok or response.status_code > 200 or response.content == "":
-            logger.warning("Could not fetch csv_url='%s' for blocker='%s' - SKIPPED!", block["csv_url"], block["blocker"])
-            continue
-
-        logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
-        reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
-
-        blockdict = list()
-
-        cnt = 0
-        for row in reader:
-            logger.debug("row[%s]='%s'", type(row), row)
-            domain = severity = None
-            reject_media = reject_reports = False
-
-            if "#domain" in row:
-                domain = row["#domain"]
-            elif "domain" in row:
-                domain = row["domain"]
-            else:
-                logger.debug("row='%s' does not contain domain column", row)
-                continue
-
-            if "#severity" in row:
-                severity = blocks.alias_block_level(row["#severity"])
-            elif "severity" in row:
-                severity = blocks.alias_block_level(row["severity"])
-            else:
-                logger.debug("row='%s' does not contain severity column", row)
-                continue
-
-            if "#reject_media" in row and row["#reject_media"].lower() == "true":
-                reject_media = True
-            elif "reject_media" in row and row["reject_media"].lower() == "true":
-                reject_media = True
-
-            if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
-                reject_reports = True
-            elif "reject_reports" in row and row["reject_reports"].lower() == "true":
-                reject_reports = True
-
-            cnt = cnt + 1
-            logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
-            if domain is None or domain == "":
-                logger.debug("domain='%s' is empty - SKIPPED!", domain)
-                continue
-            elif domain.endswith(".onion"):
-                logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
-                continue
-            elif domain.endswith(".arpa"):
-                logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
-                continue
-            elif domain.endswith(".tld"):
-                logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
-                continue
-            elif domain.find("*") >= 0 or domain.find("?") >= 0:
-                logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, block["blocker"])
-                domain = utils.deobfuscate(domain, block["blocker"])
-                logger.debug("domain='%s' - AFTER!", domain)
-
-            if not validators.domain(domain):
-                logger.debug("domain='%s' is not a valid domain - SKIPPED!")
-                continue
-            elif blacklist.is_blacklisted(domain):
-                logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
-                continue
-            elif blocks.is_instance_blocked(block["blocker"], domain, severity):
-                logger.debug("block[blocker]='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", block["blocker"], domain, severity)
-                continue
-
-            logger.debug("Marking domain='%s' as handled", domain)
-            domains.append(domain)
-
-            logger.debug("Processing domain='%s' ...", domain)
-            processed = processing.instance(domain, block["blocker"], inspect.currentframe().f_code.co_name)
-            logger.debug("processed='%s'", processed)
-
-            if processing.block(block["blocker"], domain, None, severity) and config.get("bot_enabled"):
-                logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, block["block_level"], block["blocker"])
-                blockdict.append({
-                    "blocked": domain,
-                    "reason" : block["reason"],
-                })
-
-            if reject_media:
-                processing.block(block["blocker"], domain, None, "reject_media")
-            if reject_reports:
-                processing.block(block["blocker"], domain, None, "reject_reports")
-
-        logger.debug("block[blocker]='%s'", block["blocker"])
-        if not blocklists.has(block["blocker"]):
-            logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", block["blocker"], len(domains))
-            instances.set_total_blocks(block["blocker"], domains)
-
-        logger.debug("Checking if blocker='%s' has pending updates ...", block["blocker"])
-        if instances.has_pending(block["blocker"]):
-            logger.debug("Flushing updates for block[blocker]='%s' ...", block["blocker"])
-            instances.update(block["blocker"])
-
-        logger.debug("Invoking commit() ...")
-        database.connection.commit()
-
-        logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
-        if config.get("bot_enabled") and len(blockdict) > 0:
-            logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", block["blocker"], len(blockdict))
-            network.send_bot_post(block["blocker"], blockdict)
+        processing.csv_block(block["blocker"], f"{base_url}/{block['csv_url']}", inspect.currentframe().f_code.co_name)
 
     logger.debug("Success! - EXIT!")
     return 0
@@ -1197,7 +1097,7 @@ def fetch_fedipact(args: argparse.Namespace) -> int:
     source_domain = "fedipact.online"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1256,7 +1156,7 @@ def fetch_joinmobilizon(args: argparse.Namespace) -> int:
     source_domain = "instances.joinmobilizon.org"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1304,7 +1204,7 @@ def fetch_joinmisskey(args: argparse.Namespace) -> int:
     source_domain = "instanceapp.misskey.page"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1352,7 +1252,7 @@ def fetch_joinfediverse(args: argparse.Namespace) -> int:
     source_domain = "joinfediverse.wiki"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1657,7 +1557,7 @@ def fetch_fedilist(args: argparse.Namespace) -> int:
     source_domain = "demo.fedilist.com"
     if sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 1
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
@@ -1797,7 +1697,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int:
         return 1
     elif sources.is_recent(source_domain):
         logger.info("API from source_domain='%s' has recently being accessed - EXIT!", source_domain)
-        return 0
+        return 2
     else:
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
index b6d018b8f7574ab36fe153193a79fe908861302e..819d8a8af0593b38ffa6249cb7857f198e94cdf8 100644 (file)
@@ -22,7 +22,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 #logger.setLevel(logging.DEBUG)
 
-# URLs to fetch
+# Blocklists hosted by oliphant
 oliphant_blocklists = (
     {
         "blocker": "artisan.chat",
@@ -72,7 +72,15 @@ oliphant_blocklists = (
     },{
         "blocker": "oliphant.social",
         "csv_url": "mastodon/birdsite.csv",
-    }
+    },
+)
+
+# Other CSV files
+csv_files = (
+    {
+        "blocker": "tooters.org",
+        "csv_url": "https://raw.githubusercontent.com/victorwynne/victorwynne/tooters/federation/tooters_defederations.csv",
+    },
 )
 
 def has(domain: str) -> bool:
@@ -81,7 +89,7 @@ def has(domain: str) -> bool:
 
     # Default is not found
     found = False
-    for row in oliphant_blocklists:
+    for row in oliphant_blocklists + csv_files:
         logger.debug("row[blocker]='%s',domain='%s'", row["blocker"], domain)
         if row["blocker"] == domain:
             found = True
index 5881ded240d52b3190d843a531b6d409aeb84631..116aa2fa14967a06dfcde715a381ee71af27f7a8 100644 (file)
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
+import csv
 import logging
 
+import validators
+
+from fba import database
 from fba import utils
 
 from fba.helpers import blacklist
+from fba.helpers import blocklists
+from fba.helpers import config
 from fba.helpers import domain as domain_helper
+from fba.helpers import tidyup
 
 from fba.http import federation
 from fba.http import network
@@ -96,3 +103,148 @@ def block(blocker: str, blocked: str, reason: str, block_level: str) -> bool:
 
     logger.debug("added='%s' - EXIT!", added)
     return added
+
+def csv_block(blocker: str, url: str, command: str):
+    logger.debug("blocker='%s',url='%s',command='%s' - CALLED!", blocker, url, command)
+    domain_helper.raise_on(blocker)
+
+    if not isinstance(url, str):
+        raise ValueError(f"url[]='{url}' is not of type 'str'")
+    elif url == "":
+        raise ValueError("Parameter 'url' is empty")
+    elif not isinstance(command, str):
+        raise ValueError(f"command[]='{command}' is not of type 'str'")
+    elif command == "":
+        raise ValueError("Parameter 'command' is empty")
+
+    logger.debug("Setting last_blocked for blocker='%s' ...", blocker)
+    instances.set_last_blocked(blocker)
+
+    domains = list()
+
+    # Fetch this URL
+    logger.info("Fetching url='%s' for blocker='%s' ...", url, blocker)
+    response = utils.fetch_url(
+        url,
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    )
+
+    logger.debug("response.ok='%s',response.status_code=%d,response.content()=%d", response.ok, response.status_code, len(response.content))
+    if not response.ok or response.status_code > 200 or response.content == "":
+        logger.warning("Could not fetch url='%s' for blocker='%s' - EXIT!", url, blocker)
+        return
+
+    logger.debug("Fetched %d Bytes, parsing CSV ...", len(response.content))
+    reader = csv.DictReader(response.content.decode("utf-8").splitlines(), dialect="unix")
+
+    blockdict = list()
+
+    cnt = 0
+    for row in reader:
+        logger.debug("row[%s]='%s'", type(row), row)
+        domain = severity = reason = None
+        reject_media = reject_reports = False
+
+        if "#domain" in row:
+            domain = tidyup.domain(row["#domain"]) if row["#domain"] != None and row["#domain"] != "" else None
+        elif "domain" in row:
+            domain = tidyup.domain(row["domain"]) if row["domain"] != None and row["domain"] != "" else None
+        elif "Domain" in row:
+            domain = tidyup.domain(row["Domain"]) if row["Domain"] != None and row["Domain"] != "" else None
+        else:
+            logger.warning("row='%s' does not contain domain column - SKIPPED!", row)
+            continue
+
+        if "#severity" in row:
+            severity = blocks.alias_block_level(row["#severity"])
+        elif "severity" in row:
+            severity = blocks.alias_block_level(row["severity"])
+        else:
+            logger.debug("row='%s' does not contain severity column, setting 'reject'", row)
+            severity = "reject"
+
+        if "reason" in row:
+            reason = tidyup.reason(row["reason"]) if row["reason"] != None and row["reason"] != "" else None
+        elif "comment" in row:
+            reason = tidyup.reason(row["comment"]) if row["comment"] != None and row["comment"] != "" else None
+        else:
+            logger.debug("row='%s' has no reason/comment key provided", row)
+
+        if "#reject_media" in row and row["#reject_media"].lower() == "true":
+            reject_media = True
+        elif "reject_media" in row and row["reject_media"].lower() == "true":
+            reject_media = True
+
+        if "#reject_reports" in row and row["#reject_reports"].lower() == "true":
+            reject_reports = True
+        elif "reject_reports" in row and row["reject_reports"].lower() == "true":
+            reject_reports = True
+
+        cnt = cnt + 1
+        logger.debug("domain='%s',severity='%s',reject_media='%s',reject_reports='%s'", domain, severity, reject_media, reject_reports)
+        if domain is None or domain == "":
+            logger.debug("domain='%s' is empty - SKIPPED!", domain)
+            continue
+        elif domain.endswith(".onion"):
+            logger.debug("domain='%s' is a TOR .onion domain - SKIPPED", domain)
+            continue
+        elif domain.endswith(".arpa"):
+            logger.debug("domain='%s' is a reverse IP address - SKIPPED", domain)
+            continue
+        elif domain.endswith(".tld"):
+            logger.debug("domain='%s' is a fake domain - SKIPPED", domain)
+            continue
+        elif domain.find("*") >= 0 or domain.find("?") >= 0:
+            logger.debug("domain='%s' is obfuscated - Invoking utils.deobfuscate(%s, %s) ...", domain, domain, blocker)
+            domain = utils.deobfuscate(domain, blocker)
+            logger.debug("domain='%s' - AFTER!", domain)
+
+        if not validators.domain(domain):
+            logger.debug("domain='%s' is not a valid domain - SKIPPED!", domain)
+            continue
+        elif blacklist.is_blacklisted(domain):
+            logger.warning("domain='%s' is blacklisted - SKIPPED!", domain)
+            continue
+        elif blocks.is_instance_blocked(blocker, domain, severity):
+            logger.debug("blocker='%s' has already blocked domain='%s' with severity='%s' - SKIPPED!", blocker, domain, severity)
+            continue
+
+        logger.debug("Marking domain='%s' as handled", domain)
+        domains.append(domain)
+
+        logger.debug("Processing domain='%s',blocker='%s',command='%s' ...", domain, blocker, command)
+        processed = instance(domain, blocker, command)
+        logger.debug("processed='%s'", processed)
+
+        if block(blocker, domain, reason, severity) and config.get("bot_enabled"):
+            logger.debug("Appending blocked='%s',reason='%s' for blocker='%s' ...", domain, reason, blocker)
+            blockdict.append({
+                "blocked": domain,
+                "reason" : reason,
+            })
+
+        if reject_media:
+            block(blocker, domain, None, "reject_media")
+        if reject_reports:
+            block(blocker, domain, None, "reject_reports")
+
+    logger.debug("blocker='%s'", blocker)
+    if not blocklists.has(blocker):
+        logger.debug("Invoking instances.set_total_blocks(%s, domains()=%d) ...", blocker, len(domains))
+        instances.set_total_blocks(blocker, domains)
+
+    logger.debug("Checking if blocker='%s' has pending updates ...", blocker)
+    if instances.has_pending(blocker):
+        logger.debug("Flushing updates for blocker='%s' ...", blocker)
+        instances.update(blocker)
+
+    logger.debug("Invoking commit() ...")
+    database.connection.commit()
+
+    logger.debug("config.get(bot_enabled)='%s',blockdict()=%d", config.get("bot_enabled"), len(blockdict))
+    if config.get("bot_enabled") and len(blockdict) > 0:
+        logger.info("Sending bot POST for blocker='%s',blockdict()=%d ...", blocker, len(blockdict))
+        network.send_bot_post(blocker, blockdict)
+
+    logger.debug("EXIT!")