X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=fba%2Fhelpers%2Fdomain.py;h=86148928e405748613a914f241f93489aed216b5;hb=600b452ced0fbf1c9c9796cf5afed63a1baead5a;hp=5328e3f37eeddc8a25043b390249704a7486de82;hpb=aa6a95081bb9fd510eb1b838c7c929fbae92c300;p=fba.git diff --git a/fba/helpers/domain.py b/fba/helpers/domain.py index 5328e3f..8614892 100644 --- a/fba/helpers/domain.py +++ b/fba/helpers/domain.py @@ -16,15 +16,22 @@ import logging +from functools import lru_cache from urllib.parse import urlparse import validators +from fba.helpers import blacklist +from fba.helpers import config + +from fba.models import instances + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def raise_on(domain: str): logger.debug("domain='%s' - CALLED!", domain) + if not isinstance(domain, str): raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'") elif domain == "": @@ -33,29 +40,78 @@ def raise_on(domain: str): raise ValueError(f"Parameter domain='{domain}' must be all lower-case") elif not validators.domain(domain.split("/")[0]): raise ValueError(f"domain='{domain}' is not a valid domain") - elif domain.endswith(".arpa"): - raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") elif domain.endswith(".onion"): raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!") + elif domain.endswith(".i2p") and not config.get("allow_i2p_domain") == "true": + raise ValueError(f"domain='{domain}' is an I2P, please don't crawl them!") + elif domain.endswith(".arpa"): + raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!") elif domain.endswith(".tld"): raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!") logger.debug("EXIT!") +@lru_cache def is_in_url(domain: str, url: str) -> bool: logger.debug("domain='%s',url='%s' - CALLED!", domain, url) raise_on(domain) - if not isinstance(url, str): - raise ValueError(f"Parameter url[]='%s' is not of type 'str'", type(url)) + if blacklist.is_blacklisted(domain): + raise ValueError(f"domain='{domain}' is blacklisted but function was invoked") + elif not isinstance(url, str): + raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'") elif url == "": raise ValueError("Parameter 'url' is empty") + elif not validators.url(url): + raise ValueError(f"Parameter url='{url}' is not a valid URL") - components = urlparse(url) punycode = domain.encode("idna").decode("utf-8") + components = urlparse(url) logger.debug("components[]='%s',punycode='%s'", type(components), punycode) - is_found = (punycode == components.netloc or punycode == components.hostname) + + is_found = (punycode in [components.netloc, components.hostname]) logger.debug("is_found='%s' - EXIT!", is_found) return is_found + +@lru_cache +def is_wanted(domain: str) -> bool: + logger.debug("domain='%s' - CALLED!", domain) + + if not isinstance(domain, str): + raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'") + elif domain == "": + raise ValueError("Parameter 'domain' is empty") + + wanted = True + if domain.lower() != domain: + logger.debug("domain='%s' is not all-lowercase - setting False ...", domain) + wanted = False + elif not validators.domain(domain.split("/")[0]): + logger.debug("domain='%s' is not a valid domain name - setting False ...", domain) + wanted = False + elif domain.endswith(".arpa"): + logger.debug("domain='%s' is a domain for reversed IP addresses - setting False ...", domain) + wanted = False + elif domain.endswith(".onion"): + logger.debug("domain='%s' is a TOR .onion domain - setting False ...", domain) + wanted = False + elif domain.endswith(".i2p") and not config.get("allow_i2p_domain") == "true": + logger.debug("domain='%s' is an I2P domain - setting False ...", domain) + wanted = False + elif domain.endswith(".tld"): + logger.debug("domain='%s' is a fake domain - setting False ...", domain) + wanted = False + elif blacklist.is_blacklisted(domain): + logger.debug("domain='%s' is blacklisted - setting False ...", domain) + wanted = False + elif domain.find("/profile/") > 0 or domain.find("/users/") > 0 or (instances.is_registered(domain.split("/")[0]) and domain.find("/c/") > 0): + logger.debug("domain='%s' is a single user", domain) + wanted = False + elif domain.find("/tag/") > 0: + logger.debug("domain='%s' is a tag", domain) + wanted = False + + logger.debug("wanted='%s' - EXIT!", wanted) + return wanted