import logging
+from functools import lru_cache
from urllib.parse import urlparse
import validators
from fba.helpers import blacklist
+from fba.helpers import config
from fba.models import instances
def raise_on(domain: str):
    """Validate ``domain`` and raise if it must not be processed.

    The checks run in order and the first failing one raises, so a later
    check may assume all earlier ones passed (e.g. the suffix checks only
    ever see a non-empty, lower-case string).

    Args:
        domain: Domain name to validate. Only the part before the first
            ``/`` is passed to ``validators.domain``.

    Raises:
        ValueError: If ``domain`` is not a ``str``, is empty, contains
            upper-case characters, is rejected by ``validators.domain``,
            or ends with ``.onion``, ``.arpa`` or ``.tld``. A ``.i2p``
            domain is rejected unless the ``allow_i2p_domain`` config
            entry equals ``"true"``.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".onion"):
        raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!")
    # Reject I2P only when it has NOT been allowed. The previous condition
    # (== "true") raised exactly when the option was switched on.
    # NOTE(review): assumes the option is stored as the string "true", as the
    # original comparison implies - confirm against the config file.
    elif domain.endswith(".i2p") and config.get("allow_i2p_domain") != "true":
        raise ValueError(f"domain='{domain}' is an I2P, please don't crawl them!")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("EXIT!")
@lru_cache
def is_in_url(domain: str, url: str) -> bool:
    """Return whether ``domain`` is the host part of ``url``.

    Results are memoized by ``functools.lru_cache``, so both arguments must
    be hashable; calls that raise are not cached.

    Args:
        domain: Domain name to look for. It must pass ``raise_on()`` and
            must not be blacklisted.
        url: URL whose network location is compared against ``domain``.

    Returns:
        ``True`` if the IDNA (punycode) form of ``domain`` equals the URL's
        ``netloc`` or ``hostname``, otherwise ``False``.

    Raises:
        ValueError: If ``domain`` fails ``raise_on()`` or is blacklisted, or
            if ``url`` is not a ``str``, is empty, or is rejected by
            ``validators.url``.
    """
    logger.debug("domain='%s',url='%s' - CALLED!", domain, url)
    raise_on(domain)

    if blacklist.is_blacklisted(domain):
        raise ValueError(f"domain='{domain}' is blacklisted but function was invoked")
    elif not isinstance(url, str):
        raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'")
    elif url == "":
        raise ValueError("Parameter 'url' is empty")
    elif not validators.url(url):
        raise ValueError(f"Parameter url='{url}' is not a valid URL")

    # Compare in ASCII (punycode) form so internationalized domain names are
    # matched against the encoded host as it appears in a URL.
    punycode = domain.encode("idna").decode("utf-8")

    # 'is_found' was previously never assigned before being logged and
    # returned. Derive it from the parsed URL: 'netloc' is the raw authority
    # (may include a port), 'hostname' is the lower-cased host without port.
    components = urlparse(url)
    logger.debug("components[]='%s',punycode='%s'", type(components), punycode)

    is_found = punycode in (components.netloc, components.hostname)

    logger.debug("is_found='%s' - EXIT!", is_found)
    return is_found
+@lru_cache
def is_wanted(domain: str) -> bool:
logger.debug("domain='%s' - CALLED!", domain)
- wanted = True
if not isinstance(domain, str):
raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
elif domain == "":
raise ValueError("Parameter 'domain' is empty")
- elif domain.lower() != domain:
+
+ wanted = True
+ if domain.lower() != domain:
+ logger.debug("domain='%s' is not all-lowercase - setting False ...", domain)
wanted = False
- elif not validators.hostname(domain.split("/")[0]):
+ elif not validators.domain(domain.split("/")[0]):
logger.debug("domain='%s' is not a valid domain name - setting False ...", domain)
wanted = False
elif domain.endswith(".arpa"):
elif domain.endswith(".onion"):
logger.debug("domain='%s' is a TOR .onion domain - setting False ...", domain)
wanted = False
+ elif domain.endswith(".i2p") and config.get("allow_i2p_domain") == "true":
+ logger.debug("domain='%s' is an I2P domain - setting False ...", domain)
+ wanted = False
elif domain.endswith(".tld"):
logger.debug("domain='%s' is a fake domain - setting False ...", domain)
wanted = False