git.mxchange.org Git - fba.git/commitdiff
Continued:
author Roland Häder <roland@mxchange.org>
Wed, 21 Jun 2023 17:39:28 +0000 (19:39 +0200)
committer Roland Häder <roland@mxchange.org>
Wed, 21 Jun 2023 17:39:28 +0000 (19:39 +0200)
- renamed fba.fba to fba.utils
- moved database-relevant code to fba.database module

18 files changed:
api.py
fba/__init__.py
fba/boot.py
fba/commands.py
fba/csrf.py
fba/database.py [new file with mode: 0644]
fba/fba.py [deleted file]
fba/http/federation.py
fba/http/network.py
fba/models/blocks.py
fba/models/error_log.py
fba/models/instances.py
fba/networks/friendica.py
fba/networks/lemmy.py
fba/networks/mastodon.py
fba/networks/misskey.py
fba/networks/pleroma.py
fba/utils.py [new file with mode: 0644]

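A recurring change in this commit: the repeated per-domain checks (validators.domain(), the ".arpa" and ".tld" suffix tests, blacklist.is_blacklisted()) are collapsed into a single call to utils.is_domain_wanted(). The body of the new fba/utils.py is not part of this excerpt; judging from the checks deleted at each call site, the consolidated helper presumably looks roughly like this sketch (an assumption, not the committed code):

    # Hypothetical reconstruction of utils.is_domain_wanted(),
    # assembled from the per-call-site checks this commit removes.
    import validators

    from fba.helpers import blacklist

    def is_domain_wanted(domain: str) -> bool:
        if not validators.domain(domain.split("/")[0]):
            # Not a syntactically valid domain name
            return False
        elif domain.endswith(".arpa"):
            # Reverse-DNS zone, don't crawl
            return False
        elif domain.endswith(".tld"):
            # Placeholder/fake TLD
            return False
        elif blacklist.is_blacklisted(domain):
            # Locally blacklisted
            return False
        return True
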
diff --git a/api.py b/api.py
index ff0fa997c096c28c081ef68ac4d386855e6bd93a..ea353035aeda6a85a4da5be8aeb93fb45ee5b504 100644 (file)
--- a/api.py
+++ b/api.py
@@ -29,7 +29,8 @@ import uvicorn
 import requests
 import validators
 
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import config
 from fba.helpers import tidyup
@@ -41,8 +42,8 @@ templates = Jinja2Templates(directory="templates")
 
 @router.get(config.get("base_url") + "/api/info.json", response_class=JSONResponse)
 def api_info():
-    fba.cursor.execute("SELECT (SELECT COUNT(domain) FROM instances), (SELECT COUNT(domain) FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube')), (SELECT COUNT(blocker) FROM blocks), (SELECT COUNT(domain) FROM instances WHERE last_error_details IS NOT NULL)")
-    row = fba.cursor.fetchone()
+    database.cursor.execute("SELECT (SELECT COUNT(domain) FROM instances), (SELECT COUNT(domain) FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube')), (SELECT COUNT(blocker) FROM blocks), (SELECT COUNT(domain) FROM instances WHERE last_error_details IS NOT NULL)")
+    row = database.cursor.fetchone()
 
     return {
         "known_instances"   : row[0],
@@ -58,25 +59,25 @@ def api_scoreboard(mode: str, amount: int):
         raise HTTPException(status_code=400, detail="Too many results")
 
     if mode == "blocked":
-        fba.cursor.execute("SELECT blocked, COUNT(blocked) AS score FROM blocks WHERE block_level = 'reject' GROUP BY blocked ORDER BY score DESC LIMIT ?", [amount])
+        database.cursor.execute("SELECT blocked, COUNT(blocked) AS score FROM blocks WHERE block_level = 'reject' GROUP BY blocked ORDER BY score DESC LIMIT ?", [amount])
     elif mode == "blocker":
-        fba.cursor.execute("SELECT blocker, COUNT(blocker) AS score FROM blocks WHERE block_level = 'reject' GROUP BY blocker ORDER BY score DESC LIMIT ?", [amount])
+        database.cursor.execute("SELECT blocker, COUNT(blocker) AS score FROM blocks WHERE block_level = 'reject' GROUP BY blocker ORDER BY score DESC LIMIT ?", [amount])
     elif mode == "reference":
-        fba.cursor.execute("SELECT origin, COUNT(domain) AS score FROM instances WHERE software IS NOT NULL GROUP BY origin ORDER BY score DESC LIMIT ?", [amount])
+        database.cursor.execute("SELECT origin, COUNT(domain) AS score FROM instances WHERE software IS NOT NULL GROUP BY origin ORDER BY score DESC LIMIT ?", [amount])
     elif mode == "software":
-        fba.cursor.execute("SELECT software, COUNT(domain) AS score FROM instances WHERE software IS NOT NULL GROUP BY software ORDER BY score DESC, software ASC LIMIT ?", [amount])
+        database.cursor.execute("SELECT software, COUNT(domain) AS score FROM instances WHERE software IS NOT NULL GROUP BY software ORDER BY score DESC, software ASC LIMIT ?", [amount])
     elif mode == "command":
-        fba.cursor.execute("SELECT command, COUNT(domain) AS score FROM instances WHERE command IS NOT NULL GROUP BY command ORDER BY score DESC, command ASC LIMIT ?", [amount])
+        database.cursor.execute("SELECT command, COUNT(domain) AS score FROM instances WHERE command IS NOT NULL GROUP BY command ORDER BY score DESC, command ASC LIMIT ?", [amount])
     elif mode == "error_code":
-        fba.cursor.execute("SELECT last_status_code, COUNT(domain) AS score FROM instances WHERE last_status_code IS NOT NULL AND last_status_code != '200' GROUP BY last_status_code ORDER BY score DESC LIMIT ?", [amount])
+        database.cursor.execute("SELECT last_status_code, COUNT(domain) AS score FROM instances WHERE last_status_code IS NOT NULL AND last_status_code != '200' GROUP BY last_status_code ORDER BY score DESC LIMIT ?", [amount])
     elif mode == "avg_peers":
-        fba.cursor.execute("SELECT software, AVG(total_peers) AS sum FROM instances WHERE software IS NOT NULL GROUP BY software HAVING sum>0 ORDER BY sum DESC LIMIT ?", [amount])
+        database.cursor.execute("SELECT software, AVG(total_peers) AS sum FROM instances WHERE software IS NOT NULL GROUP BY software HAVING sum>0 ORDER BY sum DESC LIMIT ?", [amount])
     else:
         raise HTTPException(status_code=400, detail="No filter specified")
 
     scores = list()
 
-    for domain, score in fba.cursor.fetchall():
+    for domain, score in database.cursor.fetchall():
         scores.append({
             "domain": domain,
             "score" : round(score)
@@ -102,18 +103,18 @@ def api_blocked(domain: str = None, reason: str = None, reverse: str = None):
         wildchar = "*." + ".".join(domain.split(".")[-domain.count("."):])
         punycode = domain.encode('idna').decode('utf-8')
 
-        fba.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_seen ASC",
-                  (domain, "*." + domain, wildchar, fba.get_hash(domain), punycode, "*." + punycode))
+        database.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_seen ASC",
+                  (domain, "*." + domain, wildchar, utils.get_hash(domain), punycode, "*." + punycode))
     elif reverse is not None:
         reverse = tidyup.domain(reverse)
         if not validators.domain(reverse):
             raise HTTPException(status_code=500, detail="Invalid domain")
 
-        fba.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocker = ? ORDER BY first_seen ASC", [reverse])
+        database.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocker = ? ORDER BY first_seen ASC", [reverse])
     else:
-        fba.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE reason like ? AND reason != '' ORDER BY first_seen ASC", ["%" + reason + "%"])
+        database.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE reason like ? AND reason != '' ORDER BY first_seen ASC", ["%" + reason + "%"])
 
-    blocklist = fba.cursor.fetchall()
+    blocklist = database.cursor.fetchall()
 
     result = {}
     for blocker, blocked, block_level, reason, first_seen, last_seen in blocklist:
@@ -138,7 +139,7 @@ def api_blocked(domain: str = None, reason: str = None, reverse: str = None):
 @router.get(config.get("base_url") + "/api/mutual.json", response_class=JSONResponse)
 def api_mutual(domains: list[str] = Query()):
     """Return 200 if federation is open between the two, 4xx otherwise"""
-    fba.cursor.execute(
+    database.cursor.execute(
         "SELECT block_level FROM blocks " \
         "WHERE ((blocker = :a OR blocker = :b) AND (blocked = :b OR blocked = :a OR blocked = :aw OR blocked = :bw)) " \
         "AND block_level = 'reject' " \
@@ -150,7 +151,7 @@ def api_mutual(domains: list[str] = Query()):
             "bw": "*." + domains[1],
         },
     )
-    response = fba.cursor.fetchone()
+    response = database.cursor.fetchone()
 
     if response is not None:
         # Blocks found
@@ -264,18 +265,18 @@ def rss(request: Request, domain: str = None):
         wildchar = "*." + ".".join(domain.split(".")[-domain.count("."):])
         punycode = domain.encode("idna").decode("utf-8")
 
-        fba.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_seen DESC LIMIT ?", [
+        database.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks WHERE blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? OR blocked = ? ORDER BY first_seen DESC LIMIT ?", [
             domain,
             "*." + domain, wildchar,
-            fba.get_hash(domain),
+            utils.get_hash(domain),
             punycode,
             "*." + punycode,
             config.get("rss_limit")
         ])
     else:
-        fba.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks ORDER BY first_seen DESC LIMIT ?", [config.get("rss_limit")])
+        database.cursor.execute("SELECT blocker, blocked, block_level, reason, first_seen, last_seen FROM blocks ORDER BY first_seen DESC LIMIT ?", [config.get("rss_limit")])
 
-    result = fba.cursor.fetchall()
+    result = database.cursor.fetchall()
     blocklist = []
 
     for row in result:
diff --git a/fba/__init__.py b/fba/__init__.py
index e716e103392940f3796f77efc67c15a7b97151f6..03bb0404bd7deb72ad2d3d805302b6e716314038 100644 (file)
--- a/fba/__init__.py
+++ b/fba/__init__.py
@@ -18,7 +18,8 @@ __all__ = [
     'boot',
     'commands',
     'csrf',
-    'fba',
+    'database',
+    'utils',
     # Sub packages:
     'helpers',
     'http',
diff --git a/fba/boot.py b/fba/boot.py
index 56d2643a55836f8a986e7df0cf3d87ee97e83dc6..6c49bea7acbfc51f382566ccecafa6074dce7611 100644 (file)
--- a/fba/boot.py
+++ b/fba/boot.py
@@ -19,7 +19,7 @@ import logging
 import argparse
 
 from fba import commands
-from fba import fba
+from fba import database
 
 from fba.helpers import locking
 
@@ -148,6 +148,6 @@ def run_command():
 
 def shutdown():
     logger.debug("Closing database connection ...")
-    fba.connection.close()
+    database.connection.close()
     locking.release()
     logger.debug("Shutdown completed.")
diff --git a/fba/commands.py b/fba/commands.py
index e3d71671f5c61bb98d4250d902f7901e8a56adf6..9ba1ff7905f6f81b826b44969a6bc23190a9117c 100644 (file)
--- a/fba/commands.py
+++ b/fba/commands.py
@@ -27,7 +27,8 @@ import markdown
 import reqto
 import validators
 
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import config
@@ -99,17 +100,8 @@ def fetch_bkali(args: argparse.Namespace) -> int:
             if "domain" not in entry:
                 logger.warning("entry()=%d does not contain 'domain' - SKIPPED!", len(entry))
                 continue
-            elif not validators.domain(entry["domain"]):
-                logger.warning("domain='%s' is not a valid domain - SKIPPED!", entry['domain'])
-                continue
-            elif entry["domain"].endswith(".arpa"):
-                logger.debug("entry[domain]='%s' is a domain for reversed IP addresses - SKIPPED!", entry["domain"])
-                continue
-            elif entry["domain"].endswith(".tld"):
-                logger.debug("entry[domain]='%s' is a fake domain - SKIPPED!", entry['domain'])
-                continue
-            elif blacklist.is_blacklisted(entry["domain"]):
-                logger.debug("domain='%s' is blacklisted - SKIPPED!", entry['domain'])
+            elif not utils.is_domain_wanted(entry["domain"]):
+                logger.debug("entry[domain]='%s' is not wanted - SKIPPED!", entry["domain"])
                 continue
             elif instances.is_registered(entry["domain"]):
                 logger.debug("domain='%s' is already registered - SKIPPED!", entry['domain'])
@@ -132,7 +124,7 @@ def fetch_bkali(args: argparse.Namespace) -> int:
                 logger.info("Fetching instances from domain='%s' ...", domain)
                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
 
-                logger.debug(f"Invoking cookies.clear({domain}) ...")
+                logger.debug("Invoking cookies.clear(%s) ...", domain)
                 cookies.clear(domain)
             except network.exceptions as exception:
                 logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_bkali) from domain='{domain}'")
@@ -152,7 +144,7 @@ def fetch_blocks(args: argparse.Namespace):
             logger.warning(f"domain='{args.domain}' is blacklisted, won't check it!")
             return
         elif not instances.is_registered(args.domain):
             logger.warning(f"domain='{args.domain}' is not registered, please run ./fba.py fetch_instances {args.domain} first.")
             return
 
     locking.acquire()
@@ -160,22 +152,22 @@ def fetch_blocks(args: argparse.Namespace):
     if args.domain is not None and args.domain != "":
         # Re-check single domain
         logger.debug(f"Querying database for single args.domain='{args.domain}' ...")
-        fba.cursor.execute(
+        database.cursor.execute(
             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE domain = ?", [args.domain]
         )
     elif args.software is not None and args.software != "":
         # Re-check single software
         logger.debug(f"Querying database for args.software='{args.software}' ...")
-        fba.cursor.execute(
+        database.cursor.execute(
             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software = ?", [args.software]
         )
     else:
         # Re-check after "timeout" (aka. minimum interval)
-        fba.cursor.execute(
+        database.cursor.execute(
             "SELECT domain, software, origin, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'lemmy', 'friendica', 'misskey', 'peertube') AND (last_blocked IS NULL OR last_blocked < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_block")]
         )
 
-    rows = fba.cursor.fetchall()
+    rows = database.cursor.fetchall()
     logger.info("Checking %d entries ...", len(rows))
     for blocker, software, origin, nodeinfo_url in rows:
         logger.debug("BEFORE blocker='%s',software='%s',origin='%s',nodeinfo_url='%s'", blocker, software, origin, nodeinfo_url)
@@ -229,7 +221,7 @@ def fetch_blocks(args: argparse.Namespace):
                     logger.debug(f"blocked='{blocked}',reason='{reason}' - BEFORE!")
                     blocked = tidyup.domain(blocked)
                     reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
-                    logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
+                    logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
                     if blocked == "":
                         logger.warning("blocked is empty, blocker='%s'", blocker)
@@ -263,14 +255,8 @@ def fetch_blocks(args: argparse.Namespace):
                         nodeinfo_url = row[2]
 
                     logger.debug("Looking up instance by domainm, blocked='%s'", blocked)
-                    if not validators.domain(blocked):
-                        logger.warning(f"blocked='{blocked}',software='{software}' is not a valid domain name - SKIPPED!")
-                        continue
-                    elif blocked.endswith(".arpa"):
-                        logger.debug("blocked='%s' is a domain for reversed IP addresses - SKIPPED!", blocked)
-                        continue
-                    elif blocked.endswith(".tld"):
-                        logger.debug(f"blocked='{blocked}' is a fake domain - SKIPPED!")
+                    if not utils.is_domain_wanted(blocked):
+                        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                         continue
                     elif not instances.is_registered(blocked):
                         logger.debug("Hash wasn't found, adding: blocked='%s',blocker='%s'", blocked, blocker)
@@ -296,8 +282,8 @@ def fetch_blocks(args: argparse.Namespace):
                     logger.debug(f"Invoking cookies.clear({blocked}) ...")
                     cookies.clear(blocked)
 
-            logger.debug("Committing changes ...")
-            fba.connection.commit()
+            logger.debug("Invoking commit() ...")
+            database.connection.commit()
         else:
             logger.warning("Unknown software: blocker='%s',software='%s'", blocker, software)
 
@@ -356,7 +342,7 @@ def fetch_observer(args: argparse.Namespace):
 
         try:
             logger.debug(f"Fetching table data for software='{software}' ...")
-            raw = fba.fetch_url(f"https://fediverse.observer/app/views/tabledata.php?software={software}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+            raw = utils.fetch_url(f"https://fediverse.observer/app/views/tabledata.php?software={software}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
             logger.debug(f"raw[{type(raw)}]()={len(raw)}")
 
             doc = bs4.BeautifulSoup(raw, features='html.parser')
@@ -372,26 +358,17 @@ def fetch_observer(args: argparse.Namespace):
             domain = item.decode_contents()
 
             logger.debug("domain='%s'", domain)
-            if not validators.domain(domain.split("/")[0]):
-                logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
-                continue
-            elif domain.endswith(".arpa"):
-                logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                continue
-            elif domain.endswith(".tld"):
-                logger.debug("domain='%s' is a fake domain - SKIPPED!", domain)
-                continue
-            elif blacklist.is_blacklisted(domain):
-                logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
+            if not utils.is_domain_wanted(domain):
+                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
             elif instances.is_registered(domain):
                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
                 continue
 
-            logger.info(f"Fetching instances for domain='{domain}',software='{software}'")
+            logger.info("Fetching instances for domain='%s',software='%s'", domain, software)
             federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
 
-            logger.debug(f"Invoking cookies.clear({domain}) ...")
+            logger.debug("Invoking cookies.clear(%s) ...", domain)
             cookies.clear(domain)
 
     logger.debug("EXIT!")
@@ -405,7 +382,7 @@ def fetch_todon_wiki(args: argparse.Namespace):
         "reject": list(),
     }
 
-    raw = fba.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    raw = utils.fetch_url("https://wiki.todon.eu/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(raw, "html.parser")
@@ -413,11 +390,11 @@ def fetch_todon_wiki(args: argparse.Namespace):
 
     silenced = doc.find("h3", {"id": "silencedlimited_servers"}).find_next("ul").findAll("li")
     logger.info("Checking %d silenced/limited entries ...", len(silenced))
-    blocklist["silenced"] = fba.find_domains(silenced, "div")
+    blocklist["silenced"] = utils.find_domains(silenced, "div")
 
     suspended = doc.find("h3", {"id": "suspended_servers"}).find_next("ul").findAll("li")
     logger.info("Checking %d suspended entries ...", len(suspended))
-    blocklist["reject"] = fba.find_domains(suspended, "div")
+    blocklist["reject"] = utils.find_domains(suspended, "div")
 
     for block_level in blocklist:
         blockers = blocklist[block_level]
@@ -428,10 +405,10 @@ def fetch_todon_wiki(args: argparse.Namespace):
 
             if not instances.is_registered(blocked):
                 try:
-                    logger.info(f"Fetching instances from domain='{row['domain']}' ...")
+                    logger.info("Fetching instances from domain='%s' ...", blocked)
                     federation.fetch_instances(blocked, 'chaos.social', None, inspect.currentframe().f_code.co_name)
 
-                    logger.debug(f"Invoking cookies.clear({row['domain']}) ...")
+                    logger.debug("Invoking cookies.clear(%s) ...", blocked)
                     cookies.clear(blocked)
                 except network.exceptions as exception:
                     logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_cs) from domain='{row['domain']}'")
@@ -445,7 +422,7 @@ def fetch_todon_wiki(args: argparse.Namespace):
             blocks.add_instance("todon.eu", blocked, None, block_level)
 
         logger.debug("Invoking commit() ...")
-        fba.connection.commit()
+        database.connection.commit()
 
     logger.debug("EXIT!")
 
@@ -476,7 +453,7 @@ def fetch_cs(args: argparse.Namespace):
         "reject"  : list(),
     }
 
-    raw = fba.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    raw = utils.fetch_url("https://raw.githubusercontent.com/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features='html.parser')
@@ -501,10 +478,10 @@ def fetch_cs(args: argparse.Namespace):
                 logger.debug(f"row='{row}'")
                 if not instances.is_registered(row["domain"]):
                     try:
-                        logger.info(f"Fetching instances from domain='{row['domain']}' ...")
+                        logger.info("Fetching instances from domain='%s' ...", row['domain'])
                         federation.fetch_instances(row["domain"], 'chaos.social', None, inspect.currentframe().f_code.co_name)
 
-                        logger.debug(f"Invoking cookies.clear({row['domain']}) ...")
+                        logger.debug("Invoking cookies.clear(%s) ...", row['domain'])
                         cookies.clear(row["domain"])
                     except network.exceptions as exception:
                         logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_cs) from domain='{row['domain']}'")
@@ -514,8 +491,8 @@ def fetch_cs(args: argparse.Namespace):
                     logger.debug(f"domain='{row['domain']}',block_level='{block_level}' blocked by chaos.social, adding ...")
                     blocks.add_instance('chaos.social', row["domain"], row["reason"], block_level)
 
-        logger.debug("Committing changes ...")
-        fba.connection.commit()
+        logger.debug("Invoking commit() ...")
+        database.connection.commit()
 
     logger.debug("EXIT!")
 
@@ -523,10 +500,10 @@ def fetch_fba_rss(args: argparse.Namespace):
     logger.debug("args[]='%s' - CALLED!", type(args))
     domains = list()
 
-    logger.info(f"Fetch FBA-specific RSS args.feed='{args.feed}' ...")
-    response = fba.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    logger.info("Fetch FBA-specific RSS args.feed='%s' ...", args.feed)
+    response = utils.fetch_url(args.feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
-    logger.debug(f"response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}")
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code < 300 and len(response.text) > 0:
         logger.debug(f"Parsing RSS feed ({len(response.text)} Bytes) ...")
         rss = atoma.parse_rss_bytes(response.content)
@@ -553,13 +530,13 @@ def fetch_fba_rss(args: argparse.Namespace):
     if len(domains) > 0:
         locking.acquire()
 
-        logger.info(f"Adding {len(domains)} new instances ...")
+        logger.info("Adding %d new instances ...", len(domains))
         for domain in domains:
             try:
-                logger.info(f"Fetching instances from domain='{domain}' ...")
+                logger.info("Fetching instances from domain='%s' ...", domain)
                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
 
-                logger.debug(f"Invoking cookies.clear({domain}) ...")
+                logger.debug("Invoking cookies.clear(%s) ...", domain)
                 cookies.clear(domain)
             except network.exceptions as exception:
                 logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_fba_rss) from domain='{domain}'")
@@ -574,9 +551,9 @@ def fetch_fbabot_atom(args: argparse.Namespace):
     domains = list()
 
     logger.info(f"Fetching ATOM feed='{feed}' from FBA bot account ...")
-    response = fba.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    response = utils.fetch_url(feed, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
-    logger.debug(f"response.ok={response.ok},response.status_code='{response.status_code}',response.text()={len(response.text)}")
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code < 300 and len(response.text) > 0:
         logger.debug(f"Parsing ATOM feed ({len(response.text)} Bytes) ...")
         atom = atoma.parse_atom_bytes(response.content)
@@ -615,7 +592,7 @@ def fetch_fbabot_atom(args: argparse.Namespace):
                 logger.info(f"Fetching instances from domain='{domain}' ...")
                 federation.fetch_instances(domain, None, None, inspect.currentframe().f_code.co_name)
 
-                logger.debug(f"Invoking cookies.clear({domain}) ...")
+                logger.debug("Invoking cookies.clear(%s) ...", domain)
                 cookies.clear(domain)
             except network.exceptions as exception:
                 logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_fbabot_atom) from domain='{domain}'")
@@ -645,11 +622,11 @@ def fetch_instances(args: argparse.Namespace) -> int:
         return 0
 
     # Loop through some instances
-    fba.cursor.execute(
+    database.cursor.execute(
         "SELECT domain, origin, software, nodeinfo_url FROM instances WHERE software IN ('pleroma', 'mastodon', 'friendica', 'misskey', 'lemmy', 'peertube') AND (last_instance_fetch IS NULL OR last_instance_fetch < ?) ORDER BY rowid DESC", [time.time() - config.get("recheck_instance")]
     )
 
-    rows = fba.cursor.fetchall()
+    rows = database.cursor.fetchall()
     logger.info("Checking %d entries ...", len(rows))
     for row in rows:
         logger.debug(f"domain='{row[0]}'")
@@ -730,7 +707,7 @@ def fetch_oliphant(args: argparse.Namespace):
 
         # Fetch this URL
         logger.info(f"Fetching csv_url='{block['csv_url']}' for blocker='{block['blocker']}' ...")
-        response = fba.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+        response = utils.fetch_url(f"{base_url}/{block['csv_url']}", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
         if response.ok and response.content != "":
@@ -748,24 +725,15 @@ def fetch_oliphant(args: argparse.Namespace):
                     logger.debug(f"row='{row}' does not contain domain column")
                     continue
 
-                if not validators.domain(domain):
-                    logger.warning("domain='%s' is not a valid domain name - SKIPPED!", domain)
-                    continue
-                elif domain.endswith(".arpa"):
-                    logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                    continue
-                elif domain.endswith(".tld"):
-                    logger.debug("domain='%s' is a fake domain - SKIPPED!", domain)
-                    continue
-                elif blacklist.is_blacklisted(domain):
-                    logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
+                if not utils.is_domain_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                     continue
 
                 logger.debug(f"Marking domain='{domain}' as handled")
                 domains.append(domain)
 
                 logger.debug(f"Processing domain='{domain}' ...")
-                processed = fba.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
+                processed = utils.process_domain(domain, block["blocker"], inspect.currentframe().f_code.co_name)
 
                 logger.debug(f"processed='{processed}'")
 
@@ -783,7 +751,7 @@ def fetch_txt(args: argparse.Namespace):
     logger.info(f"Checking {len(urls)} text file(s) ...")
     for url in urls:
         logger.debug("Fetching url='%s' ...", url)
-        response = fba.fetch_url(url, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+        response = utils.fetch_url(url, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
         logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
         if response.ok and response.status_code < 300 and response.text != "":
@@ -796,21 +764,12 @@ def fetch_txt(args: argparse.Namespace):
                 if domain == "":
                     logger.debug("domain is empty - SKIPPED!")
                     continue
-                elif not validators.domain(domain):
-                    logger.warning("domain='%s' is not a valid domain name - SKIPPED!", domain)
-                    continue
-                elif domain.endswith(".arpa"):
-                    logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                    continue
-                elif domain.endswith(".tld"):
-                    logger.debug("domain='%s' is a fake domain - SKIPPED!", domain)
-                    continue
-                elif blacklist.is_blacklisted(domain):
-                    logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
+                elif not utils.is_domain_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                     continue
 
                 logger.debug("domain='%s'", domain)
-                processed = fba.process_domain(domain, 'seirdy.one', inspect.currentframe().f_code.co_name)
+                processed = utils.process_domain(domain, 'seirdy.one', inspect.currentframe().f_code.co_name)
 
                 logger.debug(f"processed='{processed}'")
                 if not processed:
@@ -823,7 +782,7 @@ def fetch_fedipact(args: argparse.Namespace):
     logger.debug("args[]='%s' - CALLED!", type(args))
     locking.acquire()
 
-    response = fba.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+    response = utils.fetch_url("https://fedipact.online", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
     logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code < 300 and response.text != "":
@@ -842,17 +801,8 @@ def fetch_fedipact(args: argparse.Namespace):
             if domain == "":
                 logger.debug("domain is empty - SKIPPED!")
                 continue
-            elif not validators.domain(domain):
-                logger.warning("domain='%s' is not a valid domain name - SKIPPED!", domain)
-                continue
-            elif domain.endswith(".arpa"):
-                logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                continue
-            elif domain.endswith(".tld"):
-                logger.debug("domain='%s' is a fake domain - SKIPPED!", domain)
-                continue
-            elif blacklist.is_blacklisted(domain):
-                logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
+            elif not utils.is_domain_wanted(domain):
+                logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
                 continue
             elif instances.is_registered(domain):
                 logger.debug("domain='%s' is already registered - SKIPPED!", domain)
diff --git a/fba/csrf.py b/fba/csrf.py
index 69510b5ffdfc2ba42a9abf60f3fdd8700cae936c..89abd0562e26076b55c62470a503080287a252b4 100644 (file)
--- a/fba/csrf.py
+++ b/fba/csrf.py
@@ -56,8 +56,8 @@ def determine(domain: str, headers: dict) -> dict:
         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
     )
 
-    logger.debug(f"response.ok='{response.ok}',response.status_code={response.status_code},response.text()={len(response.text)}")
-    if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
+    if response.ok and response.status_code < 300 and response.text != "" and response.text.find("<html") > 0:
         # Save cookies
         logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
         cookies.store(domain, response.cookies.get_dict())
diff --git a/fba/database.py b/fba/database.py
new file mode 100644 (file)
index 0000000..53b33bc
--- /dev/null
+++ b/fba/database.py
@@ -0,0 +1,24 @@
+# Copyright (C) 2023 Free Software Foundation
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import logging
+import sqlite3
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Connect to database
+connection = sqlite3.connect("blocks.db")
+cursor = connection.cursor()
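
Since connection and cursor are created at import time, every module that does "from fba import database" shares the same SQLite handles. The resulting usage pattern, as seen throughout the hunks above:

    from fba import database

    # All readers share one cursor ...
    database.cursor.execute("SELECT domain FROM instances")
    rows = database.cursor.fetchall()

    # ... and writers commit on the shared connection.
    database.connection.commit()
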
diff --git a/fba/fba.py b/fba/fba.py
deleted file mode 100644 (file)
index 8edca9b..0000000
--- a/fba/fba.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (C) 2023 Free Software Foundation
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-import hashlib
-import logging
-import sqlite3
-
-from urllib.parse import urlparse
-
-import bs4
-import requests
-import validators
-
-from fba.helpers import blacklist
-from fba.helpers import cookies
-from fba.helpers import tidyup
-
-from fba.http import federation
-from fba.http import network
-
-from fba.models import instances
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Connect to database
-connection = sqlite3.connect("blocks.db")
-cursor = connection.cursor()
-
-##### Other functions #####
-
-def is_primitive(var: any) -> bool:
-    logger.debug(f"var[]='{type(var)}' - CALLED!")
-    return type(var) in {int, str, float, bool} or var is None
-
-def get_hash(domain: str) -> str:
-    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
-    if not isinstance(domain, str):
-        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
-    elif domain == "":
-        raise ValueError("Parameter 'domain' is empty")
-    elif domain.lower() != domain:
-        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
-    elif not validators.domain(domain.split("/")[0]):
-        raise ValueError(f"domain='{domain}' is not a valid domain")
-    elif domain.endswith(".arpa"):
-        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif domain.endswith(".tld"):
-        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
-
-    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
-
-def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Response:
-    logger.debug(f"url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
-    if not isinstance(url, str):
-        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
-    elif url == "":
-        raise ValueError("Parameter 'url' is empty")
-    elif not isinstance(headers, dict):
-        raise ValueError(f"Parameter headers[]='{type(headers)}' is not 'dict'")
-    elif not isinstance(timeout, tuple):
-        raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not 'tuple'")
-
-    logger.debug(f"Parsing url='{url}'")
-    components = urlparse(url)
-
-    # Invoke other function, avoid trailing ?
-    logger.debug(f"components[{type(components)}]={components}")
-    if components.query != "":
-        response = network.fetch_response(components.netloc, f"{components.path}?{components.query}", headers, timeout)
-    else:
-        response = network.fetch_response(components.netloc, components.path if isinstance(components.path, str) and components.path != '' else '/', headers, timeout)
-
-    logger.debug(f"response[]='{type(response)}' - EXXIT!")
-    return response
-
-def process_domain(domain: str, blocker: str, command: str) -> bool:
-    logger.debug(f"domain='{domain}',blocker='{blocker}',command='{command}' - CALLED!")
-    if not isinstance(domain, str):
-        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
-    elif domain == "":
-        raise ValueError("Parameter 'domain' is empty")
-    elif domain.lower() != domain:
-        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
-    elif not validators.domain(domain.split("/")[0]):
-        raise ValueError(f"domain='{domain}' is not a valid domain")
-    elif domain.endswith(".arpa"):
-        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif domain.endswith(".tld"):
-        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
-    elif not isinstance(blocker, str):
-        raise ValueError(f"Parameter blocker[]='{type(blocker)}' is not 'str'")
-    elif blocker == "":
-        raise ValueError("Parameter 'blocker' is empty")
-    elif not validators.domain(blocker.split("/")[0]):
-        raise ValueError(f"blocker='{blocker}' is not a valid domain")
-    elif blocker.endswith(".arpa"):
-        raise ValueError(f"blocker='{blocker}' is a domain for reversed IP addresses, please don't crawl them!")
-    elif blocker.endswith(".tld"):
-        raise ValueError(f"blocker='{blocker}' is a fake domain, please don't crawl them!")
-    elif not isinstance(command, str):
-        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
-    elif command == "":
-        raise ValueError("Parameter 'command' is empty")
-
-    if domain.find("*") > 0:
-        # Try to de-obscure it
-        row = instances.deobscure("*", domain)
-
-        logger.debug(f"row[{type(row)}]='{row}'")
-        if row is None:
-            logger.warning(f"Cannot de-obfucate domain='{domain}' - SKIPPED!")
-            return False
-
-        logger.debug(f"domain='{domain}' de-obscured to '{row[0]}'")
-        domain = row[0]
-    elif domain.find("?") > 0:
-        # Try to de-obscure it
-        row = instances.deobscure("?", domain)
-
-        logger.debug(f"row[{type(row)}]='{row}'")
-        if row is None:
-            logger.warning(f"Cannot de-obfucate domain='{domain}' - SKIPPED!")
-            return False
-
-        logger.debug(f"domain='{domain}' de-obscured to '{row[0]}'")
-        domain = row[0]
-
-    if not validators.domain(domain.split("/")[0]):
-        logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
-        return False
-    elif domain.endswith(".arpa"):
-        logger.warning(f"domain='{domain}' is a reversed .arpa domain and should not be used generally.")
-        return False
-    elif blacklist.is_blacklisted(domain):
-        logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
-        return False
-    elif instances.is_recent(domain):
-        logger.debug(f"domain='{domain}' has been recently checked - SKIPPED!")
-        return False
-
-    processed = False
-    try:
-        logger.info("Fetching instances for domain='%s',blocker='%s',command='%s' ...", domain, blocker, command)
-        federation.fetch_instances(domain, blocker, None, command)
-        processed = True
-
-        logger.debug(f"Invoking cookies.clear({domain}) ...")
-        cookies.clear(domain)
-    except network.exceptions as exception:
-        logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_oliphant) from domain='{domain}'")
-        instances.set_last_error(domain, exception)
-
-    logger.debug(f"processed='{processed}' - EXIT!")
-    return processed
-
-def find_domains(tags: bs4.element.ResultSet, search: str) -> list:
-    logger.debug("tags[%s]()=%d,search='%s' - CALLED!", type(tags), len(tags), search)
-    if not isinstance(tags, bs4.element.ResultSet):
-        raise ValueError(f"Parameter tags[]='{type(tags)}' is not 'ResultSet'")
-    elif not isinstance(search, str):
-        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
-    elif search == "":
-        raise ValueError("Parameter 'search' is empty")
-
-    domains = list()
-    for tag in tags:
-        logger.debug("tag[]='%s'", type(tag))
-        domain = tidyup.domain(tag.find(search).contents[0])
-        logger.debug("domain='%s'", domain)
-        if domain == "":
-            logger.debug("tag='%s' has no domain, trying <em> ...", tag)
-            domain = tidyup.domain(tag.find("em").contents[0])
-
-        logger.debug("domain='%s'", domain)
-        if not validators.domain(domain):
-            logger.debug("domain='%s' is not a valid domain name - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".arpa"):
-            logger.debug("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".tld"):
-            logger.debug("domain='%s' is a fake domain - SKIPPED!", domain)
-            continue
-        elif blacklist.is_blacklisted(domain):
-            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
-            continue
-
-        logger.debug("Appending domain='%s'", domain)
-        domains.append(domain)
-
-    logger.debug("domains()=%d - EXIT!", len(domains))
-    return domains
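
The helpers deleted above are not gone: per the commit message, connection and cursor moved to fba.database (shown above), while is_primitive(), get_hash(), fetch_url(), process_domain() and find_domains() moved to the new fba.utils. Call sites change accordingly; a before/after sketch of the import pattern used throughout this diff:

    # Before this commit:
    from fba import fba

    fba.cursor.execute("SELECT domain FROM instances")  # database access
    digest = fba.get_hash("example.com")                # helper function

    # After this commit:
    from fba import database
    from fba import utils

    database.cursor.execute("SELECT domain FROM instances")
    digest = utils.get_hash("example.com")
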
diff --git a/fba/http/federation.py b/fba/http/federation.py
index 751b7fcc9916cf731990065290b053581a199a08..5f6616d0b0777baa6d561d63fece9cedada6e6c0 100644 (file)
--- a/fba/http/federation.py
+++ b/fba/http/federation.py
@@ -21,6 +21,7 @@ import bs4
 import validators
 
 from fba import csrf
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import config
@@ -123,20 +124,11 @@ def fetch_instances(domain: str, origin: str, software: str, command: str, path:
         if instance == "":
             logger.warning(f"Empty instance after tidyup.domain(), domain='{domain}'")
             continue
-        elif not validators.domain(instance.split("/")[0]):
-            logger.warning(f"Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
-            continue
-        elif instance.endswith(".arpa"):
-            logger.warning(f"instance='{instance}' is a reversed .arpa domain and should not be used generally.")
-            continue
-        elif blacklist.is_blacklisted(instance):
-            logger.debug("instance is blacklisted:", instance)
+        elif not utils.is_domain_wanted(instance):
+            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
             continue
         elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
-            logger.debug(f"instance='{instance}' is a link to a single user profile - SKIPPED!")
-            continue
-        elif instance.endswith(".tld"):
-            logger.debug(f"instance='{instance}' is a fake domain - SKIPPED!")
+            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
             continue
         elif not instances.is_registered(instance):
             logger.debug("Adding new instance:", instance, domain)
@@ -362,17 +354,8 @@ def fetch_wellknown_nodeinfo(domain: str) -> dict:
                         url = f"https://{domain}{url}"
                         components = urlparse(url)
 
-                    if not validators.domain(components.netloc):
-                        logger.warning(f"components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
-                        continue
-                    elif domain.endswith(".arpa"):
-                        logger.warning("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-                        continue
-                    elif domain.endswith(".tld"):
-                        logger.warning("domain='%s' is a fake domain - SKIPPED!", domain)
-                        continue
-                    elif blacklist.is_blacklisted(components.netloc):
-                        logger.debug(f"components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
+                    if not utils.is_domain_wanted(components.netloc):
+                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                         continue
 
                     logger.debug("Fetching nodeinfo from:", url)
@@ -422,19 +405,21 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
     logger.debug(f"Fetching path='{path}' from '{domain}' ...")
     response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
 
-    logger.debug("domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
+    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
     if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
         logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
+
         doc = bs4.BeautifulSoup(response.text, "html.parser")
+        logger.debug("doc[]='%s'", type(doc))
 
-        logger.debug("doc[]:", type(doc))
         generator = doc.find("meta", {"name"    : "generator"})
         site_name = doc.find("meta", {"property": "og:site_name"})
 
-        logger.debug(f"generator='{generator}',site_name='{site_name}'")
+        logger.debug("generator[]='%s',site_name[]='%s'", type(generator), type(site_name))
         if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
             logger.debug("Found generator meta tag:", domain)
             software = tidyup.domain(generator.get("content"))
+
             logger.debug("software[%s]='%s'", type(software), software)
             if software is not None and software != "":
                 logger.info("domain='%s' is generated by '%s'", domain, software)
@@ -442,6 +427,7 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
         elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
             logger.debug("Found property=og:site_name:", domain)
             software = tidyup.domain(site_name.get("content"))
+
             logger.debug("software[%s]='%s'", type(software), software)
             if software is not None and software != "":
                 logger.info("domain='%s' has og:site_name='%s'", domain, software)
@@ -449,7 +435,7 @@ def fetch_generator_from_path(domain: str, path: str = "/") -> str:
 
     logger.debug("software[]='%s'", type(software))
     if isinstance(software, str) and software == "":
-        logger.debug(f"Corrected empty string to None for software of domain='{domain}'")
+        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
         software = None
     elif isinstance(software, str) and ("." in software or " " in software):
         logger.debug(f"software='{software}' may contain a version number, domain='{domain}', removing it ...")
@@ -596,16 +582,7 @@ def find_domains(tag: bs4.element.Tag) -> list:
 
         logger.debug("domain='%s',reason='%s'", domain, reason)
 
-        if not validators.domain(domain.split("/")[0]):
-            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".arpa"):
-            logger.warning("domain='%s' is a domain for reversed IP addresses - SKIPPED!", domain)
-            continue
-        elif domain.endswith(".tld"):
-            logger.warning("domain='%s' is a fake domain - SKIPPED!", domain)
-            continue
-        elif blacklist.is_blacklisted(domain):
+        if not utils.is_domain_wanted(domain):
-            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
+            logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
             continue
         elif domain == "gab.com/.ai, develop.gab.com":
@@ -661,17 +638,8 @@ def add_peers(rows: dict) -> list:
                 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
 
             logger.debug(f"peer='{peer}' - AFTER!")
-            if not validators.domain(peer):
-                logger.warning(f"peer='{peer}' is not a valid domain - SKIPPED!")
-                continue
-            elif peer.endswith(".arpa"):
-                logger.warning(f"peer='{peer}' is a domain for reversed IP addresses -SKIPPED!")
-                continue
-            elif peer.endswith(".tld"):
-                logger.warning(f"peer='{peer}' is a fake domain - SKIPPED!")
-                continue
-            elif blacklist.is_blacklisted(peer):
-                logger.debug(f"peer='{peer}' is blacklisted - SKIPPED!")
+            if not utils.is_domain_wanted(peer):
+                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                 continue
 
             logger.debug(f"Adding peer='{peer}' ...")
diff --git a/fba/http/network.py b/fba/http/network.py
index 7d0c4036c64629ad00aba0127f3834a4c47ae134..da936ac9af82d41fb3c74bcec065f8f9a531d98e 100644 (file)
--- a/fba/http/network.py
+++ b/fba/http/network.py
@@ -22,7 +22,7 @@ import requests
 import urllib3
 import validators
 
-from fba import fba
+from fba import utils
 
 from fba.helpers import config
 from fba.helpers import cookies
@@ -126,7 +126,7 @@ def fetch_api_url(url: str, timeout: tuple) -> dict:
 
     try:
         logger.debug("Fetching url='%s' ...", url)
-        response = fba.fetch_url(url, api_headers, timeout)
+        response = utils.fetch_url(url, api_headers, timeout)
 
         json_reply["json"] = json_from_response(response)
 
diff --git a/fba/models/blocks.py b/fba/models/blocks.py
index 7ce10bfac17817e9280ad21ed9ac09ee40153ae5..51036fc37203879a438f7fbe8a5d7df96576dc07 100644 (file)
--- a/fba/models/blocks.py
+++ b/fba/models/blocks.py
@@ -19,7 +19,7 @@ import logging
 import time
 import validators
 
-from fba import fba
+from fba import database
 
 from fba.helpers import blacklist
 from fba.helpers import tidyup
@@ -51,7 +51,7 @@ def update_reason(reason: str, blocker: str, blocked: str, block_level: str):
         raise ValueError("Accepted domains are not wanted here")
 
     logger.debug("Updating block reason:", reason, blocker, blocked, block_level)
-    fba.cursor.execute(
+    database.cursor.execute(
         "UPDATE blocks SET reason = ?, last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ? AND (reason IS NULL OR reason = '') LIMIT 1",
         [
             reason,
@@ -84,7 +84,7 @@ def update_last_seen(blocker: str, blocked: str, block_level: str):
     elif block_level == "accept":
         raise ValueError("Accepted domains are not wanted here")
 
-    fba.cursor.execute(
+    database.cursor.execute(
         "UPDATE blocks SET last_seen = ? WHERE blocker = ? AND blocked = ? AND block_level = ? LIMIT 1",
         [
             time.time(),
@@ -116,7 +116,7 @@ def is_instance_blocked(blocker: str, blocked: str, block_level: str) -> bool:
     elif block_level == "accept":
         raise ValueError("Accepted domains are not wanted here")
 
-    fba.cursor.execute(
+    database.cursor.execute(
         "SELECT * FROM blocks WHERE blocker = ? AND blocked = ? AND block_level = ? LIMIT 1",
         (
             blocker,
@@ -125,7 +125,7 @@ def is_instance_blocked(blocker: str, blocked: str, block_level: str) -> bool:
         ),
     )
 
-    is_blocked = fba.cursor.fetchone() is not None
+    is_blocked = database.cursor.fetchone() is not None
 
     logger.debug(f"is_blocked='{is_blocked}' - EXIT!")
     return is_blocked
@@ -165,7 +165,7 @@ def add_instance(blocker: str, blocked: str, reason: str, block_level: str):
 
     logger.info("New block: blocker='%s',blocked='%s',reason='%s',block_level='%s'", blocker, blocked, reason, block_level)
 
-    fba.cursor.execute(
+    database.cursor.execute(
         "INSERT INTO blocks (blocker, blocked, reason, block_level, first_seen, last_seen) VALUES (?, ?, ?, ?, ?, ?)",
         [
              blocker,
diff --git a/fba/models/error_log.py b/fba/models/error_log.py
index 5e844345645829b6dd5c5128e300f24ec97dd9dc..ed1f89fd7720946a41d8850cdb14855ba334aa89 100644 (file)
--- a/fba/models/error_log.py
+++ b/fba/models/error_log.py
@@ -19,7 +19,7 @@ import time
 
 import validators
 
-from fba import fba
+from fba import database
 
 from fba.helpers import config
 
@@ -50,13 +50,13 @@ def add(domain: str, error: dict):
 
     logger.debug("AFTER error[]:", type(error))
     if isinstance(error, str):
-        fba.cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",[
+        database.cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, 999, ?, ?)",[
             domain,
             error,
             time.time()
         ])
     else:
-        fba.cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",[
+        database.cursor.execute("INSERT INTO error_log (domain, error_code, error_message, created) VALUES (?, ?, ?, ?)",[
             domain,
             error["status_code"],
             error["error_message"],
@@ -65,6 +65,6 @@ def add(domain: str, error: dict):
 
     # Cleanup old entries
     logger.debug(f"Purging old records (distance: {config.get('error_log_cleanup')})")
-    fba.cursor.execute("DELETE FROM error_log WHERE created < ?", [time.time() - config.get("error_log_cleanup")])
+    database.cursor.execute("DELETE FROM error_log WHERE created < ?", [time.time() - config.get("error_log_cleanup")])
 
     logger.debug("EXIT!")
diff --git a/fba/models/instances.py b/fba/models/instances.py
index f075d1209cab504e500a9916da0ddf466801391a..4238b25fd740f9c9b24f8622ca30f0c71145f8de 100644 (file)
--- a/fba/models/instances.py
+++ b/fba/models/instances.py
@@ -21,7 +21,8 @@ import time
 import requests
 import validators
 
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import cache
@@ -80,7 +81,7 @@ def _set_data(key: str, domain: str, value: any):
         raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
     elif not key in _pending:
         raise ValueError(f"key='{key}' not found in _pending")
-    elif not fba.is_primitive(value):
+    elif not utils.is_primitive(value):
         raise ValueError(f"value[]='{type(value)}' is not a primitive type")
 
     # Set it
@@ -157,14 +158,14 @@ def update_data(domain: str):
     logger.debug("sql_string:", sql_string)
 
     logger.debug("Executing SQL:", sql_string)
-    fba.cursor.execute(sql_string, fields)
+    database.cursor.execute(sql_string, fields)
 
-    logger.debug(f"Success! (rowcount={fba.cursor.rowcount })")
-    if fba.cursor.rowcount == 0:
+    logger.debug(f"Success! (rowcount={database.cursor.rowcount })")
+    if database.cursor.rowcount == 0:
         raise Exception(f"Did not update any rows: domain='{domain}',fields()={len(fields)}")
 
-    logger.debug("Committing changes ...")
-    fba.connection.commit()
+    logger.debug("Invoking commit() ...")
+    database.connection.commit()
 
     logger.debug(f"Deleting _pending for domain='{domain}'")
     for key in _pending:
@@ -233,13 +234,13 @@ def add(domain: str, origin: str, command: str, path: str = None, software: str
             return
 
     logger.info("Adding instance domain='%s' (origin='%s',software='%s')", domain, origin, software)
-    fba.cursor.execute(
+    database.cursor.execute(
         "INSERT INTO instances (domain, origin, command, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)",
         (
            domain,
            origin,
            command,
-           fba.get_hash(domain),
+           utils.get_hash(domain),
            software,
            time.time()
         ),
@@ -339,10 +340,10 @@ def is_registered(domain: str) -> bool:
     logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
     if not cache.key_exists("is_registered"):
         logger.debug("Cache for 'is_registered' not initialized, fetching all rows ...")
-        fba.cursor.execute("SELECT domain FROM instances")
+        database.cursor.execute("SELECT domain FROM instances")
 
         # Check Set all
-        cache.set_all("is_registered", fba.cursor.fetchall(), True)
+        cache.set_all("is_registered", database.cursor.fetchall(), True)
 
     # Is cache found?
     registered = cache.sub_key_exists("is_registered", domain)
@@ -369,10 +370,10 @@ def is_recent(domain: str) -> bool:
         return False
 
     # Query database
-    fba.cursor.execute("SELECT last_instance_fetch FROM instances WHERE domain = ? LIMIT 1", [domain])
+    database.cursor.execute("SELECT last_instance_fetch FROM instances WHERE domain = ? LIMIT 1", [domain])
 
     # Fetch row
-    fetched = fba.cursor.fetchone()[0]
+    fetched = database.cursor.fetchone()[0]
 
     logger.debug(f"fetched[{type(fetched)}]='{fetched}'")
     recently = isinstance(fetched, float) and time.time() - fetched <= config.get("recheck_instance")
@@ -403,11 +404,11 @@ def deobscure(char: str, domain: str, blocked_hash: str = None) -> tuple:
 
     if isinstance(blocked_hash, str):
         logger.debug(f"Looking up blocked_hash='{blocked_hash}' ...")
-        fba.cursor.execute(
+        database.cursor.execute(
             "SELECT domain, origin, nodeinfo_url FROM instances WHERE hash = ? LIMIT 1", [blocked_hash]
         )
 
-        row = fba.cursor.fetchone()
+        row = database.cursor.fetchone()
         logger.debug("row[]='%s'", type(row))
 
         if row is None:
@@ -415,11 +416,11 @@ def deobscure(char: str, domain: str, blocked_hash: str = None) -> tuple:
             return deobscure(char, domain)
     else:
         logger.debug(f"Looking up domain='{domain}' ...")
-        fba.cursor.execute(
+        database.cursor.execute(
             "SELECT domain, origin, nodeinfo_url FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", [domain.replace(char, "_")]
         )
 
-        row = fba.cursor.fetchone()
+        row = database.cursor.fetchone()
         logger.debug("row[]='%s'", type(row))
 
     logger.debug(f"row[]='{type(row)}' - EXIT!")
diff --git a/fba/networks/friendica.py b/fba/networks/friendica.py
index b3f6bc40d342ceee218fa42b6567b4a2026ef940..8fca3302eb695c88080db184e7ddc13488dbb3f0 100644 (file)
--- a/fba/networks/friendica.py
+++ b/fba/networks/friendica.py
@@ -19,6 +19,8 @@ import logging
 import bs4
 import validators
 
+from fba import utils
+
 from fba.helpers import blacklist
 from fba.helpers import config
 from fba.helpers import tidyup
@@ -87,17 +89,8 @@ def fetch_blocks(domain: str) -> dict:
         reason  = tidyup.reason(line.find_all("td")[1].text)
         logger.debug(f"blocked='{blocked}',reason='{reason}'")
 
-        if not validators.domain(blocked):
-            logger.warning(f"blocked='{blocked}' is not a valid domain - SKIPPED!")
-            continue
-        elif blocked.endswith(".arpa"):
-            logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-            continue
-        elif blocked.endswith(".tld"):
-            logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-            continue
-        elif blacklist.is_blacklisted(blocked):
-            logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+        if not utils.is_domain_wanted(blocked):
+            logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
             continue
 
         logger.debug(f"Appending blocked='{blocked}',reason='{reason}'")
diff --git a/fba/networks/lemmy.py b/fba/networks/lemmy.py
index 5c891748347b4ff2c77c6774a8db3f0dae15bd9e..c5753865cbe5a3aa42c04c400bddd486371c2a8b 100644 (file)
--- a/fba/networks/lemmy.py
+++ b/fba/networks/lemmy.py
@@ -21,7 +21,8 @@ import bs4
 import validators
 
 from fba import csrf
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import config
@@ -171,7 +172,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
             (config.get("connection_timeout"), config.get("read_timeout"))
         )
 
-        logger.debug(f"response.ok='{response.ok}',response.status_code={response.status_code},response.text()={len(response.text)}")
+        logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
         if response.ok and response.status_code < 300 and response.text != "":
             logger.debug(f"Parsing {len(response.text)} Bytes ...")
 
@@ -203,17 +204,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 blocked = tidyup.domain(tag.contents[0])
 
                 logger.debug(f"blocked='{blocked}'")
-                if not validators.domain(blocked):
-                    logger.warning(f"blocked='{blocked}' is not a valid domain - SKIPPED!")
-                    continue
-                elif blocked.endswith(".arpa"):
-                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                    continue
-                elif blocked.endswith(".tld"):
-                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                    continue
-                elif blacklist.is_blacklisted(blocked):
-                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+                if not utils.is_domain_wanted(blocked):
+                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                     continue
                 elif not instances.is_registered(blocked):
                     logger.debug("Hash wasn't found, adding:", blocked, domain)
@@ -231,8 +223,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
                     blocks.update_last_seen(domain, blocked, "reject")
 
-        logger.debug("Committing changes ...")
-        fba.connection.commit()
+        logger.debug("Invoking commit() ...")
+        database.connection.commit()
     except network.exceptions as exception:
         logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
         instances.set_last_error(domain, exception)
diff --git a/fba/networks/mastodon.py b/fba/networks/mastodon.py
index 489062bd349881d997d9e16ce98de9dacd7d20e7..966477cb7ca7e5ae4b66750c655bc106d9a24f0d 100644 (file)
--- a/fba/networks/mastodon.py
+++ b/fba/networks/mastodon.py
@@ -21,7 +21,8 @@ import bs4
 import validators
 
 from fba import csrf
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import config
@@ -212,7 +213,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
             logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
             for block in blocklist:
                 # Check type
-                logger.debug(f"block[]='{type(block)}'")
+                logger.debug("block[]='%s'", type(block))
                 if not isinstance(block, dict):
                     logger.debug(f"block[]='{type(block)}' is of type 'dict' - SKIPPED!")
                     continue
@@ -225,46 +226,46 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     "reason": block["comment"] if "comment" in block else None
                 }
 
-                logger.debug("severity,domain,hash,comment:", block['severity'], block['domain'], block['digest'], block['comment'])
+                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], block['comment'])
                 if block['severity'] == 'suspend':
-                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
+                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                     rows['reject'].append(entry)
                 elif block['severity'] == 'silence':
-                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
+                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                     rows['followers_only'].append(entry)
                 elif block['severity'] == 'reject_media':
-                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
+                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                     rows['media_removal'].append(entry)
                 elif block['severity'] == 'reject_reports':
-                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
+                    logger.debug("Adding entry='%s' with severity='%s' ...", entry, block['severity'])
                     rows['report_removal'].append(entry)
                 else:
-                    logger.warning(f"Unknown severity='{block['severity']}', domain='{block['domain']}'")
+                    logger.warning("Unknown severity='%s', domain='%s'", block['severity'], block['domain'])
         else:
-            logger.debug(f"domain='{domain}' has returned zero rows, trying /about/more page ...")
+            logger.debug("domain='%s' has returned zero rows, trying /about/more page ...", domain)
             rows = fetch_blocks_from_about(domain)
 
         logger.info("Checking %d entries from domain='%s' ...", len(rows.items()), domain)
         for block_level, blocklist in rows.items():
-            logger.debug("domain,block_level,blocklist():", domain, block_level, len(blocklist))
+            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
             block_level = tidyup.domain(block_level)
 
-            logger.debug("AFTER-block_level:", block_level)
+            logger.debug("block_level='%s' - AFTER!", block_level)
             if block_level == "":
-                logger.warning("block_level is empty, domain:", domain)
+                logger.warning("block_level is empty, domain='%s'", domain)
                 continue
             elif block_level == "accept":
-                logger.debug(f"domain='{domain}' skipping block_level='accept'")
+                logger.debug("domain='%s' skipping block_level='accept'", domain)
                 continue
 
             logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
             for block in blocklist:
-                logger.debug(f"block[]='{type(block)}'")
+                logger.debug("block[]='%s'", type(block))
                 blocked, blocked_hash, reason = block.values()
                 logger.debug(f"blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}':")
                 blocked = tidyup.domain(blocked)
                 reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
-                logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
+                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
                 if blocked == "":
                     logger.warning("blocked is empty, domain='%s'", domain)
@@ -281,7 +282,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                         logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                         continue
 
-                    logger.debug("Updating domain: ", row[0])
+                    logger.debug("Updating domain: row[0]='%s'", row[0])
                     blocked      = row[0]
                     origin       = row[1]
                     nodeinfo_url = row[2]
@@ -294,40 +295,22 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                         logger.warning(f"Cannot deobsfucate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                         continue
 
-                    logger.debug("Updating domain: ", row[0])
+                    logger.debug("Updating domain: row[0]='%s'", row[0])
                     blocked      = row[0]
                     origin       = row[1]
                     nodeinfo_url = row[2]
 
                 logger.debug("Looking up instance by domain:", blocked)
-                if not validators.domain(blocked):
-                    logger.warning(f"blocked='{blocked}' is not a valid domain name - SKIPPED!")
-                    continue
-                elif blocked.endswith(".arpa"):
-                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                    continue
-                elif blocked.endswith(".tld"):
-                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                    continue
-                elif blacklist.is_blacklisted(blocked):
-                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+                if not utils.is_domain_wanted(blocked):
+                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                     continue
                 elif not instances.is_registered(blocked):
                     logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                     instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
 
                 logger.debug("Looking up instance by domain:", blocked)
-                if not validators.domain(blocked):
-                    logger.warning(f"blocked='{blocked}' is not a valid domain name - SKIPPED!")
-                    continue
-                elif blocked.endswith(".arpa"):
-                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                    continue
-                elif blocked.endswith(".tld"):
-                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                    continue
-                elif blacklist.is_blacklisted(blocked):
-                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+                if not utils.is_domain_wanted(blocked):
+                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                     continue
                 elif not instances.is_registered(blocked):
                     logger.debug("Hash wasn't found, adding:", blocked, domain)
@@ -347,8 +330,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     blocks.update_last_seen(domain, blocked, block_level)
                     blocks.update_reason(reason, domain, blocked, block_level)
 
-        logger.debug("Committing changes ...")
-        fba.connection.commit()
+        logger.debug("Invoking commit() ...")
+        database.connection.commit()
     except network.exceptions as exception:
         logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
         instances.set_last_error(domain, exception)
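
Many hunks above also convert f-string log calls to lazy %-style formatting, so the message is only built when the record is actually emitted. A small self-contained illustration of the difference:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    block = {"severity": "suspend"}
    logger.debug(f"block[]='{type(block)}'")     # f-string is formatted even though DEBUG is off
    logger.debug("block[]='%s'", type(block))    # deferred: formatted only if DEBUG is enabled
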
diff --git a/fba/networks/misskey.py b/fba/networks/misskey.py
index 6407fc8b430a525a484bed3ad9dd19585107022d..239814ceb1a68876d5f7acd9d97be3784492d6d2 100644 (file)
--- a/fba/networks/misskey.py
+++ b/fba/networks/misskey.py
@@ -19,8 +19,8 @@ import logging
 import validators
 
 from fba import csrf
+from fba import utils
 
-from fba.helpers import blacklist
 from fba.helpers import config
 from fba.helpers import dicts
 from fba.helpers import tidyup
@@ -116,17 +116,8 @@ def fetch_peers(domain: str) -> list:
             elif not isinstance(row["host"], str):
                 logger.warning(f"row[host][]='{type(row['host'])}' is not 'str' - SKIPPED!")
                 continue
-            elif not validators.domain(row["host"].split("/")[0]):
-                logger.warning(f"row[host]='{row['host']}' is not a valid domain - SKIPPED!")
-                continue
-            elif row["host"].endswith(".arpa"):
-                logger.warning(f"row[host]='{row['host']}' is a domain for reversed IP addresses - SKIPPED!")
-                continue
-            elif row["host"].endswith(".tld"):
-                logger.warning(f"row[host]='{row['host']}' is a fake domain - SKIPPED!")
-                continue
-            elif blacklist.is_blacklisted(row["host"]):
-                logger.debug(f"row[host]='{row['host']}' is blacklisted. domain='{domain}' - SKIPPED!")
+            elif not utils.is_domain_wanted(row["host"]):
+                logger.debug(f"row[host]='{row['host']}' is not wanted, domain='{domain}' - SKIPPED!")
                 continue
             elif row["host"] in peers:
                 logger.debug(f"Not adding row[host]='{row['host']}', already found.")
diff --git a/fba/networks/pleroma.py b/fba/networks/pleroma.py
index de5864c1f0155c5a58df8b3ade4f13612cab7ff9..8ef7a06a53cb51c7617a32cc1b04ffea75a36beb 100644 (file)
--- a/fba/networks/pleroma.py
+++ b/fba/networks/pleroma.py
@@ -20,7 +20,8 @@ import logging
 import bs4
 import validators
 
-from fba import fba
+from fba import database
+from fba import utils
 
 from fba.helpers import blacklist
 from fba.helpers import config
@@ -107,7 +108,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 logger.warning("block_level is now empty!")
                 continue
             elif block_level == "accept":
-                logger.debug(f"domain='{domain}' skipping block_level='accept'")
+                logger.debug("domain='%s' skipping block_level='accept'", domain)
                 continue
 
             logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
@@ -151,21 +152,13 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                         nodeinfo_url = row[2]
 
                     logger.debug(f"blocked='{blocked}'")
-                    if not validators.domain(blocked):
-                        logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
-                        continue
-                    elif blocked.endswith(".arpa"):
-                        logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                        continue
-                    elif blocked.endswith(".tld"):
-                        logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                        continue
-                    elif blacklist.is_blacklisted(blocked):
-                        logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+                    if not utils.is_domain_wanted(blocked):
+                        logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                         continue
                     elif not instances.is_registered(blocked):
                         # Commit changes
-                        fba.connection.commit()
+                        logger.debug("Invoking commit() ...")
+                        database.connection.commit()
 
                         logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                         instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -227,21 +220,13 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 nodeinfo_url = row[2]
 
             logger.debug(f"blocked='{blocked}'")
-            if not validators.domain(blocked):
-                logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
-                continue
-            elif blocked.endswith(".arpa"):
-                logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                continue
-            elif blocked.endswith(".tld"):
-                logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                continue
-            elif blacklist.is_blacklisted(blocked):
-                logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+            if not utils.is_domain_wanted(blocked):
+                logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                 continue
             elif not instances.is_registered(blocked):
                 # Commit changes
-                fba.connection.commit()
+                logger.debug("Invoking commit() ...")
+                database.connection.commit()
 
                 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                 instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)
@@ -262,8 +247,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
     else:
         logger.warning(f"Cannot find 'mrf_simple' or 'quarantined_instances' in JSON reply: domain='{domain}'")
 
-    logger.debug("Committing changes ...")
-    fba.connection.commit()
+    logger.debug("Invoking commit() ...")
+    database.connection.commit()
 
     # Reasons
     if "mrf_simple_info" in data:
@@ -283,10 +268,10 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 logger.warning("block_level is now empty!")
                 continue
             elif block_level == "accept":
-                logger.debug(f"domain='{domain}' skipping block_level='accept'")
+                logger.debug("domain='%s' skipping block_level='accept'", domain)
                 continue
 
-            logger.debug(f"Checking {len(info.items())} entries from domain='{domain}',software='pleroma',block_level='{block_level}' ...")
+            logger.debug(f"Checking {len(info.items())} entries from domain='{domain}',block_level='{block_level}' ...")
             for blocked, reason in info.items():
                 logger.debug(f"blocked='{blocked}',reason[{type(reason)}]='{reason}' - BEFORE!")
                 blocked = tidyup.domain(blocked)
@@ -300,7 +285,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 elif reason is not None:
                     raise ValueError(f"Cannot handle reason[]='{type(reason)}'")
 
-                logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
+                logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
                 if blocked == "":
                     logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
@@ -336,17 +321,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     nodeinfo_url = row[2]
 
                 logger.debug(f"blocked='{blocked}'")
-                if not validators.domain(blocked):
-                    logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
-                    continue
-                elif blocked.endswith(".arpa"):
-                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                    continue
-                elif blocked.endswith(".tld"):
-                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                    continue
-                elif blacklist.is_blacklisted(blocked):
-                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+                if not utils.is_domain_wanted(blocked):
+                    logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                     continue
                 elif not instances.is_registered(blocked):
                     logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
@@ -414,17 +390,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                 nodeinfo_url = row[2]
 
             logger.debug(f"blocked='{blocked}'")
-            if not validators.domain(blocked):
-                logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
-                continue
-            elif blocked.endswith(".arpa"):
-                logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                continue
-            elif blocked.endswith(".tld"):
-                logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
-                continue
-            elif blacklist.is_blacklisted(blocked):
-                logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
+            if not utils.is_domain_wanted(blocked):
+                logger.debug("blocked='%s' is not wanted - SKIPPED!", blocked)
                 continue
             elif not instances.is_registered(blocked):
                 logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
@@ -457,7 +424,7 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                     logger.debug(f"record[]='{type(record)}'")
                     blocked = tidyup.domain(record["blocked"])
                     reason  = tidyup.reason(record["reason"])
-                    logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")
+                    logger.debug("blocked='%s',reason='%s' - AFTER!", blocked, reason)
 
                     if blocked == "":
                         logger.warning("blocked is empty after tidyup.domain():", domain, block_level)
@@ -493,14 +460,8 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                         nodeinfo_url = row[2]
 
                     logger.debug(f"blocked='{blocked}'")
-                    if not validators.domain(blocked):
-                        logger.warning(f"blocked='{blocked}',software='pleroma' is not a valid domain name - SKIPPED!")
-                        continue
-                    elif blocked.endswith(".arpa"):
-                        logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
-                        continue
-                    elif blocked.endswith(".tld"):
-                        logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
+                    if not utils.is_domain_wanted(blocked):
+                        logger.warning("blocked='%s' is not wanted - SKIPPED!", blocked)
                         continue
                     elif not instances.is_registered(blocked):
                         logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
@@ -520,7 +481,9 @@ def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
                         logger.debug(f"Updating block last seen for domain='{domain}',blocked='{blocked}' ...")
                         blocks.update_reason(reason, domain, blocked, block_level)
 
-    fba.connection.commit()
+    logger.debug("Invoking commit() ...")
+    database.connection.commit()
+
     logger.debug("EXIT!")
 
 def fetch_blocks_from_about(domain: str) -> dict:
diff --git a/fba/utils.py b/fba/utils.py
new file mode 100644 (file)
index 0000000..46fa37e
--- /dev/null
+++ b/fba/utils.py
@@ -0,0 +1,211 @@
+# Copyright (C) 2023 Free Software Foundation
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import hashlib
+import logging
+
+from urllib.parse import urlparse
+
+import bs4
+import requests
+import validators
+
+from fba.helpers import blacklist
+from fba.helpers import cookies
+from fba.helpers import tidyup
+
+from fba.http import federation
+from fba.http import network
+
+from fba.models import instances
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+##### Other functions #####
+
+def is_primitive(var: any) -> bool:
+    logger.debug(f"var[]='{type(var)}' - CALLED!")
+    return type(var) in {int, str, float, bool} or var is None
+
+def get_hash(domain: str) -> str:
+    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
+    if not isinstance(domain, str):
+        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
+    elif domain == "":
+        raise ValueError("Parameter 'domain' is empty")
+    elif domain.lower() != domain:
+        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
+    elif not validators.domain(domain.split("/")[0]):
+        raise ValueError(f"domain='{domain}' is not a valid domain")
+    elif domain.endswith(".arpa"):
+        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
+    elif domain.endswith(".tld"):
+        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
+
+    return hashlib.sha256(domain.encode("utf-8")).hexdigest()
+
+def fetch_url(url: str, headers: dict, timeout: tuple) -> requests.models.Response:
+    logger.debug(f"url='{url}',headers()={len(headers)},timeout={timeout} - CALLED!")
+    if not isinstance(url, str):
+        raise ValueError(f"Parameter url[]='{type(url)}' is not 'str'")
+    elif url == "":
+        raise ValueError("Parameter 'url' is empty")
+    elif not isinstance(headers, dict):
+        raise ValueError(f"Parameter headers[]='{type(headers)}' is not 'dict'")
+    elif not isinstance(timeout, tuple):
+        raise ValueError(f"Parameter timeout[]='{type(timeout)}' is not 'tuple'")
+
+    logger.debug(f"Parsing url='{url}'")
+    components = urlparse(url)
+
+    # Invoke other function, avoid trailing ?
+    logger.debug(f"components[{type(components)}]={components}")
+    if components.query != "":
+        response = network.fetch_response(components.netloc, f"{components.path}?{components.query}", headers, timeout)
+    else:
+        response = network.fetch_response(components.netloc, components.path if isinstance(components.path, str) and components.path != '' else '/', headers, timeout)
+
+    logger.debug(f"response[]='{type(response)}' - EXXIT!")
+    return response
+
+def process_domain(domain: str, blocker: str, command: str) -> bool:
+    logger.debug(f"domain='{domain}',blocker='{blocker}',command='{command}' - CALLED!")
+    if not isinstance(domain, str):
+        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
+    elif domain == "":
+        raise ValueError("Parameter 'domain' is empty")
+    elif domain.lower() != domain:
+        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
+    elif not validators.domain(domain.split("/")[0]):
+        raise ValueError(f"domain='{domain}' is not a valid domain")
+    elif domain.endswith(".arpa"):
+        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
+    elif domain.endswith(".tld"):
+        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
+    elif not isinstance(blocker, str):
+        raise ValueError(f"Parameter blocker[]='{type(blocker)}' is not 'str'")
+    elif blocker == "":
+        raise ValueError("Parameter 'blocker' is empty")
+    elif not validators.domain(blocker.split("/")[0]):
+        raise ValueError(f"blocker='{blocker}' is not a valid domain")
+    elif blocker.endswith(".arpa"):
+        raise ValueError(f"blocker='{blocker}' is a domain for reversed IP addresses, please don't crawl them!")
+    elif blocker.endswith(".tld"):
+        raise ValueError(f"blocker='{blocker}' is a fake domain, please don't crawl them!")
+    elif not isinstance(command, str):
+        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
+    elif command == "":
+        raise ValueError("Parameter 'command' is empty")
+
+    if domain.find("*") > 0:
+        # Try to de-obscure it
+        row = instances.deobscure("*", domain)
+
+        logger.debug(f"row[{type(row)}]='{row}'")
+        if row is None:
+            logger.warning(f"Cannot de-obfucate domain='{domain}' - SKIPPED!")
+            return False
+
+        logger.debug(f"domain='{domain}' de-obscured to '{row[0]}'")
+        domain = row[0]
+    elif domain.find("?") > 0:
+        # Try to de-obscure it
+        row = instances.deobscure("?", domain)
+
+        logger.debug(f"row[{type(row)}]='{row}'")
+        if row is None:
+            logger.warning(f"Cannot de-obfucate domain='{domain}' - SKIPPED!")
+            return False
+
+        logger.debug(f"domain='{domain}' de-obscured to '{row[0]}'")
+        domain = row[0]
+
+    if not is_domain_wanted(domain):
+        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+        return False
+    elif instances.is_recent(domain):
+        logger.debug(f"domain='{domain}' has been recently checked - SKIPPED!")
+        return False
+
+    processed = False
+    try:
+        logger.info("Fetching instances for domain='%s',blocker='%s',command='%s' ...", domain, blocker, command)
+        federation.fetch_instances(domain, blocker, None, command)
+        processed = True
+
+        logger.debug("Invoking cookies.clear(%s) ...", domain)
+        cookies.clear(domain)
+    except network.exceptions as exception:
+        logger.warning(f"Exception '{type(exception)}' during fetching instances (fetch_oliphant) from domain='{domain}'")
+        instances.set_last_error(domain, exception)
+
+    logger.debug(f"processed='{processed}' - EXIT!")
+    return processed
+
+def find_domains(tags: bs4.element.ResultSet, search: str) -> list:
+    logger.debug("tags[%s]()=%d,search='%s' - CALLED!", type(tags), len(tags), search)
+    if not isinstance(tags, bs4.element.ResultSet):
+        raise ValueError(f"Parameter tags[]='{type(tags)}' is not 'ResultSet'")
+    elif not isinstance(search, str):
+        raise ValueError(f"Parameter search[]='{type(search)}' is not 'str'")
+    elif search == "":
+        raise ValueError("Parameter 'search' is empty")
+
+    domains = list()
+    for tag in tags:
+        logger.debug("tag[]='%s'", type(tag))
+        domain = tidyup.domain(tag.find(search).contents[0])
+
+        logger.debug("domain='%s'", domain)
+        if domain == "":
+            logger.debug("tag='%s' has no domain, trying <em> ...", tag)
+            domain = tidyup.domain(tag.find("em").contents[0])
+
+        if not is_domain_wanted(domain):
+            logger.debug("domain='%s' is not wanted - SKIPPED!")
+            continue
+
+        logger.debug("Appending domain='%s'", domain)
+        domains.append(domain)
+
+    logger.debug("domains()=%d - EXIT!", len(domains))
+    return domains
+
+def is_domain_wanted(domain: str) -> bool:
+    logger.debug("domain='%s' - CALLED!", domain)
+    wanted = True
+
+    if not isinstance(domain, str):
+        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
+    elif domain == "":
+        raise ValueError("Parameter 'domain' is empty")
+    elif domain.lower() != domain:
+        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
+    elif not validators.domain(domain.split("/")[0]):
+        logger.debug("domain='%s' is not a valid domain name - settings False ...", domain)
+        wanted = False
+    elif domain.endswith(".arpa"):
+        logger.debug("domain='%s' is a domain for reversed IP addresses - settings False ...", domain)
+        wanted = False
+    elif domain.endswith(".tld"):
+        logger.debug("domain='%s' is a fake domain - settings False ...", domain)
+        wanted = False
+    elif blacklist.is_blacklisted(domain):
+        logger.debug("domain='%s' is blacklisted - settings False ...", domain)
+        wanted = False
+
+    logger.debug("wanted='%s' - EXIT!", wanted)
+    return wanted
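
Closing usage sketch for the new fba.utils helpers defined above (assumes the fba package is importable; the example domain is illustrative):

    from fba import utils

    print(utils.is_primitive(23.5))            # True
    print(utils.get_hash("example.social"))    # sha256 hex digest of the domain name
    print(utils.is_domain_wanted("fake.tld"))  # False - ".tld" fake domains are rejected
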