X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fmodels%2Finstances.py;h=8a4103ef518d45bf01221270541c6c76a507a5bc;hb=8d598a643e074bba4b83ff42a74a7c8f42c5b22d;hp=d3f7f43bc6b1ebcef2e0d7cea26190b5e6df6a86;hpb=b7cc4315e607c1dc0d4e9473dbd98add2644ec5d;p=fba.git diff --git a/fba/models/instances.py b/fba/models/instances.py index d3f7f43..8a4103e 100644 --- a/fba/models/instances.py +++ b/fba/models/instances.py @@ -27,6 +27,8 @@ from fba import utils from fba.helpers import blacklist from fba.helpers import cache from fba.helpers import config +from fba.helpers import domain as domain_helper +from fba.helpers import tidyup from fba.http import federation from fba.http import network @@ -35,18 +37,23 @@ from fba.models import error_log logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +#logger.setLevel(logging.DEBUG) # Found info from node, such as nodeinfo URL, detection mode that needs to be # written to database. Both arrays must be filled at the same time or else -# update_data() will fail +# update() will fail _pending = { - # Detection mode: 'AUTO_DISCOVERY', 'STATIC_CHECKS' or 'GENERATOR' + # Detection mode # NULL means all detection methods have failed (maybe still reachable instance) "detection_mode" : {}, # Found nodeinfo URL "nodeinfo_url" : {}, # Found total peers "total_peers" : {}, + # Found total blocks + "total_blocks" : {}, + # Obfuscated domains + "obfuscated_blocks" : {}, # Last fetched instances "last_instance_fetch": {}, # Last updated @@ -55,32 +62,32 @@ _pending = { "last_blocked" : {}, # Last nodeinfo (fetched) "last_nodeinfo" : {}, + # Last response time + "last_response_time" : {}, # Last status code "last_status_code" : {}, # Last error details "last_error_details" : {}, + # Wether obfuscation has been used + "has_obfuscation" : {}, + # Original software + "original_software" : {}, + # Aliased software + "software" : {}, } def _set_data(key: str, domain: str, value: any): - logger.debug(f"key='{key}',domain='{domain}',value[]='{type(value)}' - CALLED!") + logger.debug("key='%s',domain='%s',value[]='%s' - CALLED!", key, domain, type(value)) + domain_helper.raise_on(domain) + if not isinstance(key, str): - raise ValueError("Parameter key[]='{type(key)}' is not 'str'") + raise ValueError(f"Parameter key[]='{type(key)}' is not of type 'str'") elif key == "": raise ValueError("Parameter 'key' is empty") - elif not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") elif not key in _pending: raise ValueError(f"key='{key}' not found in _pending") + elif blacklist.is_blacklisted(domain): + raise Exception(f"domain='{domain}' is blacklisted but function has been invoked") elif not utils.is_primitive(value): raise ValueError(f"value[]='{type(value)}' is not a primitive type") @@ -90,254 +97,224 @@ def _set_data(key: str, domain: str, value: any): logger.debug("EXIT!") def has_pending(domain: str) -> bool: - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) + + if not is_registered(domain): + raise ValueError(f"domain='{domain}' is not registered but function was invoked.") + elif blacklist.is_blacklisted(domain): + raise Exception(f"domain='{domain}' is blacklisted but function has been invoked") has = False + logger.debug("Checking %d _pending array elements ...", len(_pending)) for key in _pending: - logger.debug(f"key='{key}',domain='{domain}',_pending[key]()='{len(_pending[key])}'") + logger.debug("domain='%s',_pending[%s]()=%d", domain, key, len(_pending[key])) if domain in _pending[key]: + logger.debug("domain='%s' at key='%s' has pending data ...", domain, key) has = True break - logger.debug(f"has='{has}' - EXIT!") + logger.debug("has='%s' - EXIT!", has) return has -def update_data(domain: str): - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") +def update(domain: str): + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) + + if not is_registered(domain): + raise Exception(f"domain='{domain}' cannot be updated while not being registered") elif not has_pending(domain): raise Exception(f"domain='{domain}' has no pending instance data, but function invoked") - elif not is_registered(domain): - raise Exception(f"domain='{domain}' cannot be updated while not being registered") + elif blacklist.is_blacklisted(domain): + raise Exception(f"domain='{domain}' is blacklisted but function has been invoked") - logger.debug(f"Updating instance data for domain='{domain}' ...") sql_string = "" fields = list() + + logger.debug("Checking %d _pending array elements ...", len(_pending)) for key in _pending: - logger.debug("key:", key) + logger.debug("Checking key='%s',domain='%s'", key, domain) if domain in _pending[key]: - logger.debug(f"Adding '{_pending[key][domain]}' for key='{key}' ...") + logger.debug("Adding '%s' for key='%s' ...", _pending[key][domain], key) fields.append(_pending[key][domain]) sql_string += f" {key} = ?," - logger.debug(f"sql_string()={len(sql_string)}") + logger.debug("sql_string(%d)='%s'", len(sql_string), sql_string) if sql_string == "": - raise ValueError(f"No fields have been set, but method invoked, domain='{domain}'") + raise ValueError(f"No fields have been set, but function invoked, domain='{domain}'") # Set last_updated to current timestamp fields.append(time.time()) # For WHERE statement + logger.debug("Setting domain='%s' for WHERE statement ...", domain) fields.append(domain) - logger.debug(f"sql_string='{sql_string}',fields()={len(fields)}") + logger.debug("sql_string='%s',fields()=%d", sql_string, len(fields)) sql_string = "UPDATE instances SET" + sql_string + " last_updated = ? WHERE domain = ? LIMIT 1" - logger.debug("sql_string:", sql_string) - logger.debug("Executing SQL:", sql_string) + logger.debug("Executing SQL: sql_string='%s',fields()=%d", sql_string, len(fields)) database.cursor.execute(sql_string, fields) - logger.debug(f"Success! (rowcount={database.cursor.rowcount })") + logger.debug("rowcount=%d", database.cursor.rowcount) if database.cursor.rowcount == 0: raise Exception(f"Did not update any rows: domain='{domain}',fields()={len(fields)}") logger.debug("Invoking commit() ...") database.connection.commit() - logger.debug(f"Deleting _pending for domain='{domain}'") + logger.debug("Deleting _pending for domain='%s'", domain) for key in _pending: - logger.debug(f"domain='{domain}',key='{key}'") + logger.debug("domain='%s',key='%s'", domain, key) if domain in _pending[key]: + logger.debug("Deleting key='%s',domain='%s' ...", key, domain) del _pending[key][domain] logger.debug("EXIT!") def add(domain: str, origin: str, command: str, path: str = None, software: str = None): - logger.debug(f"domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(origin, str) and origin is not None: - raise ValueError(f"origin[]='{type(origin)}' is not 'str'") + logger.debug("domain='%s',origin='%s',command='%s',path='%s',software='%s' - CALLED!", domain, origin, command, path, software) + domain_helper.raise_on(domain) + + if not isinstance(origin, str) and origin is not None: + raise ValueError(f"origin[]='{type(origin)}' is not of type 'str'") elif origin == "": raise ValueError("Parameter 'origin' is empty") elif not isinstance(command, str): - raise ValueError(f"command[]='{type(command)}' is not 'str'") + raise ValueError(f"command[]='{type(command)}' is not of type 'str'") elif command == "": raise ValueError("Parameter 'command' is empty") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"Bad domain name='{domain}'") elif not isinstance(path, str) and path is not None: - raise ValueError(f"path[]='{type(path)}' is not 'str'") + raise ValueError(f"path[]='{type(path)}' is not of type 'str'") elif path == "": raise ValueError("Parameter 'path' is empty") + elif path is not None and not path.startswith("/"): + raise ValueError(f"path='{path}' does not start with / but should") elif not isinstance(software, str) and software is not None: - raise ValueError(f"software[]='{type(software)}' is not 'str'") + raise ValueError(f"software[]='{type(software)}' is not of type 'str'") elif software == "": raise ValueError("Parameter 'software' is empty") - elif domain.endswith(".arpa"): - raise ValueError(f"Please don't crawl .arpa domains: domain='{domain}'") elif origin is not None and not validators.domain(origin.split("/")[0]): raise ValueError(f"Bad origin name='{origin}'") elif blacklist.is_blacklisted(domain): - raise Exception(f"domain='{domain}' is blacklisted, but method invoked") - elif domain.find("/profile/") > 0 or domain.find("/users/") > 0 or (software == "lemmy" and domain.find("/c/") > 0): + raise Exception(f"domain='{domain}' is blacklisted, but function invoked") + elif domain.find("/profile/") > 0 or domain.find("/users/") > 0 or (is_registered(domain.split("/")[0]) and domain.find("/c/") > 0): raise Exception(f"domain='{domain}' is a single user") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + elif domain.find("/tag/") > 0: + raise Exception(f"domain='{domain}' is a tag") if software is None: try: - logger.debug("domain,origin,command,path:", domain, origin, command, path) + logger.debug("domain='%s',origin='%s',command='%s',path='%s'", domain, origin, command, path) software = federation.determine_software(domain, path) except network.exceptions as exception: logger.warning("Exception '%s' during determining software type, domain='%s'", type(exception), domain) set_last_error(domain, exception) - logger.debug("Determined software:", software) + logger.debug("Determined software='%s'", software) if software == "lemmy" and domain.find("/c/") > 0: domain = domain.split("/c/")[0] + + logger.debug("domain='%s' - LEMMY /c/ !", domain) if is_registered(domain): logger.warning("domain='%s' already registered after cutting off user part. - EXIT!", domain) return - logger.info("Adding instance domain='%s' (origin='%s',software='%s')", domain, origin, software) + logger.info("Adding instance domain='%s',origin='%s',software='%s',command='%s' ...", domain, origin, software, command) database.cursor.execute( - "INSERT INTO instances (domain, origin, command, hash, software, first_seen) VALUES (?, ?, ?, ?, ?, ?)", + "INSERT INTO instances (domain, origin, command, hash, software, original_software, first_seen) VALUES (?, ?, ?, ?, ?, ?, ?)", ( domain, origin, command, utils.get_hash(domain), software, + software, time.time() ), ) - logger.debug(f"Marking domain='{domain}' as registered.") + logger.debug("Marking domain='%s' as registered.", domain) cache.set_sub_key("is_registered", domain, True) + logger.debug("Checking if domain='%s' has pending updates ...", domain) if has_pending(domain): - logger.debug(f"domain='{domain}' has pending nodeinfo being updated ...") - update_data(domain) + logger.debug("Flushing updates for domain='%s' ...", domain) + update(domain) logger.debug("EXIT!") def set_last_nodeinfo(domain: str): - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - - logger.debug("Updating last_nodeinfo for domain:", domain) - _set_data("last_nodeinfo", domain, time.time()) + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) - # Running pending updated - logger.debug(f"Invoking update_data({domain}) ...") - update_data(domain) + logger.debug("Updating last_nodeinfo for domain='%s'", domain) + _set_data("last_nodeinfo", domain, time.time()) logger.debug("EXIT!") def set_last_error(domain: str, error: dict): - logger.debug("domain,error[]:", domain, type(error)) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - - logger.debug("BEFORE error[]:", type(error)) + logger.debug("domain='%s',error[]='%s' - CALLED!", domain, type(error)) + domain_helper.raise_on(domain) + + logger.debug("error[]='%s' - BEFORE!", type(error)) if isinstance(error, (BaseException, json.decoder.JSONDecodeError)): error = f"error[{type(error)}]='{str(error)}'" - logger.debug("AFTER error[]:", type(error)) + logger.debug("error[]='%s' - AFTER!", type(error)) if isinstance(error, str): - logger.debug(f"Setting last_error_details='{error}'") + logger.debug("Setting last_error_details='%s' (str)", error) _set_data("last_status_code" , domain, 999) _set_data("last_error_details", domain, error if error != "" else None) elif isinstance(error, requests.models.Response): - logger.debug(f"Setting last_error_details='{error.reason}'") + logger.debug("Setting last_error_details='%s' (Response)", error.reason) _set_data("last_status_code" , domain, error.status_code) _set_data("last_error_details", domain, error.reason if error.reason != "" else None) elif not isinstance(error, dict): raise KeyError(f"Cannot handle keys in error[{type(error)}]='{error}'") elif "status_code" in error and "error_message" in error: - logger.debug(f"Setting last_error_details='{error['error_message']}'") + logger.debug("Setting last_error_details='%s' (error_message)", error['error_message']) _set_data("last_status_code" , domain, error["status_code"]) _set_data("last_error_details", domain, error["error_message"] if error["error_message"] != "" else None) + elif "json" in error and "error" in error["json"] and "msg" in error["json"]: + logger.debug("Setting last_error_details='%s' (json,error)", error["json"]["msg"]) + _set_data("last_status_code" , domain, error["status_code"]) + _set_data("last_error_details", domain, error["json"]["msg"] if error["json"]["msg"] != "" else None) + elif "json" in error and "error" in error["json"] and "message" in error["json"]["error"]: + logger.debug("Setting last_error_details='%s' (json,error)", error["json"]["error"]["message"]) + _set_data("last_status_code" , domain, error["status_code"]) + _set_data("last_error_details", domain, error["json"]["error"]["message"] if error["json"]["error"]["message"] != "" else None) elif "json" in error and "error" in error["json"]: + logger.debug("Setting last_error_details='%s' (json,error)", error["json"]["error"]) _set_data("last_status_code" , domain, error["status_code"]) _set_data("last_error_details", domain, error["json"]["error"] if error["json"]["error"] != "" else None) - logger.debug(f"Invoking error_log.add(domain='{domain}',error[]='{type(error)}'") + logger.debug("Invoking error_log.add(domain='%s',error[]='%s'", domain, type(error)) error_log.add(domain, error) logger.debug("EXIT!") -def is_registered(domain: str) -> bool: - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) +def set_success(domain: str): + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) + + # Set both to success + _set_data("last_status_code" , domain, 200) + _set_data("last_error_details", domain, None) + + logger.debug("EXIT!") + +def is_registered(domain: str, skip_raise = False) -> bool: + logger.debug("domain='%s',skip_raise='%s' - CALLED!", domain, skip_raise) + domain_helper.raise_on(domain) + + if not isinstance(skip_raise, bool): + raise ValueError(f"skip_raise[]='{type(skip_raise)}' is not type of 'bool'") + + if not skip_raise: + domain_helper.raise_on(domain) + + logger.debug("domain='%s' - CALLED!", domain) if not cache.key_exists("is_registered"): logger.debug("Cache for 'is_registered' not initialized, fetching all rows ...") database.cursor.execute("SELECT domain FROM instances") @@ -348,185 +325,275 @@ def is_registered(domain: str) -> bool: # Is cache found? registered = cache.sub_key_exists("is_registered", domain) - logger.debug(f"registered='{registered}' - EXIT!") + logger.debug("registered='%s' - EXIT!", registered) return registered -def is_recent(domain: str) -> bool: - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") +def is_recent(domain: str, column: str = "last_instance_fetch") -> bool: + logger.debug("domain='%s',column='%s' - CALLED!", domain, column) + domain_helper.raise_on(domain) + + if not isinstance(column, str): + raise ValueError(f"Parameter column[]='{type(column)}' is not of type 'str'") + elif not column.startswith("last_"): + raise ValueError(f"Parameter column='{column}' is not expected") elif not is_registered(domain): - logger.debug(f"domain='{domain}' is not registered, returning False - EXIT!") + logger.debug("domain='%s' is not registered, returning False - EXIT!", domain) return False + key = "recheck_instance" + if column == "last_blocked": + key = "recheck_block" + # Query database - database.cursor.execute("SELECT last_instance_fetch FROM instances WHERE domain = ? LIMIT 1", [domain]) + database.cursor.execute(f"SELECT {column} FROM instances WHERE domain = ? LIMIT 1", [domain]) # Fetch row - fetched = database.cursor.fetchone()[0] + row = database.cursor.fetchone() - logger.debug(f"fetched[{type(fetched)}]='{fetched}'") - recently = isinstance(fetched, float) and time.time() - fetched <= config.get("recheck_instance") + fetched = float(row[column]) if row[column] is not None else 0.0 - logger.debug(f"recently='{recently}' - EXIT!") + diff = (time.time() - fetched) + + logger.debug("fetched[%s]='%s',key='%s',diff=%f", type(fetched), fetched, key, diff) + recently = bool(diff < config.get(key)) + + logger.debug("recently='%s' - EXIT!", recently) return recently -def deobscure(char: str, domain: str, blocked_hash: str = None) -> tuple: - logger.debug(f"char='{char}',domain='{domain}',blocked_hash='{blocked_hash}' - CALLED!") +def deobfuscate(char: str, domain: str, blocked_hash: str = None) -> tuple: + logger.debug("char='%s',domain='%s',blocked_hash='%s' - CALLED!", char, domain, blocked_hash) + if not isinstance(char, str): - raise ValueError(f"Parameter char[]='{type(char)}' is not 'str'") + raise ValueError(f"Parameter char[]='{type(char)}' is not of type 'str'") elif char == "": raise ValueError("Parameter 'char' is empty") + elif not char in domain: + raise ValueError(f"char='{char}' not found in domain='{domain}' but function invoked") elif not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + raise ValueError(f"Parameter domain[]='{type(domain)}'") elif not isinstance(blocked_hash, str) and blocked_hash is not None: - raise ValueError(f"Parameter blocked_hash[]='{type(blocked_hash)}' is not 'str'") + raise ValueError(f"Parameter blocked_hash[]='{type(blocked_hash)}' is not of type 'str'") + # Init row + row = None + + logger.debug("blocked_hash[]='%s'", type(blocked_hash)) if isinstance(blocked_hash, str): - logger.debug(f"Looking up blocked_hash='{blocked_hash}' ...") + logger.debug("Looking up blocked_hash='%s',domain='%s' ...", blocked_hash, domain) database.cursor.execute( - "SELECT domain, origin, nodeinfo_url FROM instances WHERE hash = ? LIMIT 1", [blocked_hash] + "SELECT domain, origin, nodeinfo_url FROM instances WHERE hash = ? OR domain LIKE ? LIMIT 1", [blocked_hash, domain.replace(char, "_")] ) row = database.cursor.fetchone() logger.debug("row[]='%s'", type(row)) if row is None: - logger.debug(f"blocked_hash='{blocked_hash}' not found, trying domain='{domain}' ...") - return deobscure(char, domain) - else: - logger.debug(f"Looking up domain='{domain}' ...") + logger.debug("blocked_hash='%s' not found, trying domain='%s' ...", blocked_hash, domain) + return deobfuscate(char, domain) + elif not domain.startswith("*."): + logger.debug("domain='%s' - BEFORE!", domain) + domain = tidyup.domain(domain) + logger.debug("domain='%s' - AFTER!", domain) + + if domain == "": + logger.warning("domain is empty after tidyup - EXIT!") + return None + + search = domain.replace(char, "_") + + logger.debug("Looking up domain='%s',search='%s' ...", domain, search) database.cursor.execute( - "SELECT domain, origin, nodeinfo_url FROM instances WHERE domain LIKE ? ORDER BY rowid LIMIT 1", [domain.replace(char, "_")] + "SELECT domain, origin, nodeinfo_url FROM instances WHERE domain LIKE ? OR 'https://' || domain LIKE ? ORDER BY rowid LIMIT 1", [search, search] ) row = database.cursor.fetchone() logger.debug("row[]='%s'", type(row)) - logger.debug(f"row[]='{type(row)}' - EXIT!") + logger.debug("row[]='%s' - EXIT!", type(row)) return row def set_last_blocked(domain: str): - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) # Set timestamp _set_data("last_blocked", domain, time.time()) logger.debug("EXIT!") def set_last_instance_fetch(domain: str): - logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain) - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") + logger.debug("domain='%s' - CALLED!", domain) + domain_helper.raise_on(domain) # Set timestamp _set_data("last_instance_fetch", domain, time.time()) logger.debug("EXIT!") +def set_last_response_time(domain: str, response_time: float): + logger.debug("domain='%s',response_time=%d - CALLED!", domain, response_time) + domain_helper.raise_on(domain) + + if not isinstance(response_time, float): + raise ValueError(f"response_time[]='{type(response_time)}' is not of type 'float'") + elif response_time < 0: + raise ValueError(f"response_time={response_time} is below zero") + + # Set timestamp + _set_data("last_response_time", domain, response_time) + logger.debug("EXIT!") + def set_total_peers(domain: str, peers: list): - logger.debug(f"domain='{domain}',peers()={len(peers)} - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(peers, list): - raise ValueError(f"Parameter peers[]='{type(peers)}' is not 'list'") + logger.debug("domain='%s',peers()=%d - CALLED!", domain, len(peers)) + domain_helper.raise_on(domain) + + if not isinstance(peers, list): + raise ValueError(f"Parameter peers[]='{type(peers)}' is not of type 'list'") # Set timestamp _set_data("total_peers", domain, len(peers)) logger.debug("EXIT!") +def set_total_blocks(domain: str, blocks: list): + logger.debug("domain='%s',blocks()=%d - CALLED!", domain, len(blocks)) + domain_helper.raise_on(domain) + + if not isinstance(blocks, list): + raise ValueError(f"Parameter blocks[]='{type(blocks)}' is not of type 'list'") + + # Set timestamp + _set_data("total_blocks", domain, len(blocks)) + logger.debug("EXIT!") + +def set_obfuscated_blocks(domain: str, obfuscated: int): + logger.debug("domain='%s',obfuscated=%d - CALLED!", domain, obfuscated) + domain_helper.raise_on(domain) + + if not isinstance(obfuscated, int): + raise ValueError(f"Parameter obfuscated[]='{type(obfuscated)}' is not of type 'int'") + elif obfuscated < 0: + raise ValueError(f"Parameter obfuscated={obfuscated} is not valid") + + # Set timestamp + _set_data("obfuscated_blocks", domain, obfuscated) + logger.debug("EXIT!") + def set_nodeinfo_url(domain: str, url: str): - logger.debug(f"domain='{domain}',url='{url}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(url, str): - raise ValueError("Parameter url[]='{type(url)}' is not 'list'") + logger.debug("domain='%s',url='%s' - CALLED!", domain, url) + domain_helper.raise_on(domain) + + if not isinstance(url, str) and url is not None: + raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'") elif url == "": raise ValueError("Parameter 'url' is empty") + elif url is not None and not validators.url(url): + raise ValueError(f"Parameter url='{url}' is not a valid URL") # Set timestamp _set_data("nodeinfo_url", domain, url) logger.debug("EXIT!") def set_detection_mode(domain: str, mode: str): - logger.debug(f"domain='{domain}',mode='{mode}' - CALLED!") - if not isinstance(domain, str): - raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'") - elif domain == "": - raise ValueError("Parameter 'domain' is empty") - elif domain.lower() != domain: - raise ValueError(f"Parameter domain='{domain}' must be all lower-case") - elif not validators.domain(domain.split("/")[0]): - raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") - elif domain.endswith(".tld"): - raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") - elif not isinstance(mode, str): - raise ValueError("Parameter mode[]='{type(mode)}' is not 'list'") + logger.debug("domain='%s',mode='%s' - CALLED!", domain, mode) + domain_helper.raise_on(domain) + + if not isinstance(mode, str) and mode is not None: + raise ValueError(f"Parameter mode[]='{type(mode)}' is not of type 'str'") elif mode == "": raise ValueError("Parameter 'mode' is empty") # Set timestamp _set_data("detection_mode", domain, mode) logger.debug("EXIT!") + +def set_has_obfuscation(domain: str, status: bool): + logger.debug("domain='%s',status='%s' - CALLED!", domain, status) + domain_helper.raise_on(domain) + + if not isinstance(status, bool): + raise ValueError(f"Parameter status[]='{type(status)}' is not of type 'bool'") + + # Set timestamp + _set_data("has_obfuscation", domain, status) + logger.debug("EXIT!") + +def set_original_software(domain: str, software: str): + logger.debug("domain='%s',software='%s' - CALLED!", domain, software) + domain_helper.raise_on(domain) + + if not isinstance(software, str) and software is not None: + raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'") + elif software == "": + raise ValueError("Parameter 'software' is empty") + + # Set original software + _set_data("original_software", domain, software) + logger.debug("EXIT!") + + +def set_software(domain: str, software: str): + logger.debug("domain='%s',software='%s' - CALLED!", domain, software) + domain_helper.raise_on(domain) + + if not isinstance(software, str) and software is not None: + raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'") + elif software == "": + raise ValueError("Parameter 'software' is empty") + + # Set software (maybe aliased to generic name) + _set_data("software", domain, software) + logger.debug("EXIT!") + +def valid(value: str, column: str) -> bool: + logger.debug("value='%s' - CALLED!", value) + if not isinstance(value, str): + raise ValueError(f"Parameter value[]='{type(value)}' is not of type 'str'") + elif value == "": + raise ValueError("Parameter 'value' is empty") + elif not isinstance(column, str): + raise ValueError(f"Parameter column[]='{type(column)}' is not of type 'str'") + elif column == "": + raise ValueError("Parameter 'column' is empty") + + # Query database + database.cursor.execute( + f"SELECT {column} FROM instances WHERE {column} = ? LIMIT 1", [value] + ) + + is_valid = database.cursor.fetchone() is not None + + logger.debug("is_valid='%s' - EXIT!", is_valid) + return is_valid + +def translate_idnas(rows: list, column: str): + logger.debug("rows[]='%s' - CALLED!", type(rows)) + + if not isinstance(rows, list): + raise ValueError("rows[]='{type(rows)}' is not of type 'list'") + elif len(rows) == 0: + raise ValueError("Parameter 'rows' is an empty list") + elif not isinstance(column, str): + raise ValueError(f"column='{type(column)}' is not of type 'str'") + elif column == "": + raise ValueError("Parameter 'column' is empty") + elif column not in ["domain", "origin"]: + raise ValueError(f"column='{column}' is not supported") + + logger.info("Checking/converting %d domain names ...", len(rows)) + for row in rows: + logger.debug("row[]='%s'", type(row)) + + translated = row[column].encode("idna").decode("utf-8") + logger.debug("translated='%s',row[%s]='%s'", translated, column, row[column]) + + if translated != row[column]: + logger.info("Translated row[%s]='%s' to '%s'", column, row[column], translated) + if is_registered(translated, True): + logger.warning("Deleting row[%s]='%s' as translated='%s' already exist", column, row[column], translated) + database.cursor.execute(f"DELETE FROM instances WHERE {column} = ? LIMIT 1", [row[column]]) + else: + logger.debug("Updating row[%s]='%s' to translated='%s' ...", column, row[column], translated) + database.cursor.execute(f"UPDATE instances SET {column} = ? WHERE {column} = ? LIMIT 1", [translated, row[column]]) + + logger.debug("Invoking commit() ...") + database.connection.commit() + + logger.debug("EXIT!")