import logging
+from functools import lru_cache
from urllib.parse import urlparse
import validators
raise ValueError(f"domain='{domain}' is not a valid domain")
elif domain.endswith(".onion"):
raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!")
- elif domain.endswith(".i2p") and config.get("allow_i2p_domain"):
+ elif domain.endswith(".i2p") and config.get("allow_i2p_domain") == "true":
raise ValueError(f"domain='{domain}' is an I2P, please don't crawl them!")
elif domain.endswith(".arpa"):
raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
logger.debug("EXIT!")
+@lru_cache
def is_in_url(domain: str, url: str) -> bool:
logger.debug("domain='%s',url='%s' - CALLED!", domain, url)
raise_on(domain)
- if not isinstance(url, str):
+ if blacklist.is_blacklisted(domain):
+ raise ValueError(f"domain='{domain}' is blacklisted but function was invoked")
+ elif not isinstance(url, str):
raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'")
elif url == "":
raise ValueError("Parameter 'url' is empty")
+ elif not validators.url(url):
+ raise ValueError(f"Parameter url='{url}' is not a valid URL")
punycode = domain.encode("idna").decode("utf-8")
logger.debug("is_found='%s' - EXIT!", is_found)
return is_found
+@lru_cache
def is_wanted(domain: str) -> bool:
logger.debug("domain='%s' - CALLED!", domain)
wanted = True
if domain.lower() != domain:
- logger.debug("domain='%s' is not all-lowercase - setting False ...", domain)
+ logger.debug("domain='%s' is not all-lowercase - setting False ...", domain)
wanted = False
elif not validators.domain(domain.split("/")[0]):
logger.debug("domain='%s' is not a valid domain name - setting False ...", domain)
elif domain.endswith(".onion"):
logger.debug("domain='%s' is a TOR .onion domain - setting False ...", domain)
wanted = False
- elif domain.endswith(".i2p") and config.get("allow_i2p_domain"):
+ elif domain.endswith(".i2p") and config.get("allow_i2p_domain") == "true":
logger.debug("domain='%s' is an I2P domain - setting False ...", domain)
wanted = False
elif domain.endswith(".tld"):