import logging
+from functools import lru_cache
from urllib.parse import urlparse
import validators
+from fba.helpers import blacklist
+from fba.helpers import config
+
+from fba.models import instances
+
# Configure the root logger at import time with level INFO. With this level,
# the logger.debug() calls in this module produce no output unless the
# level is lowered elsewhere.
# NOTE(review): basicConfig() at import time is a module-level side effect;
# confirm that configuring logging from this module is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def raise_on(domain: str) -> None:
    """Validate `domain` and raise if it must not be processed.

    The checks run in this order and the first failing one raises:
    type, emptiness, lower-case spelling, syntactic validity of the part
    before the first "/", then the unwanted suffixes ".onion", ".i2p"
    (unless enabled in the configuration), ".arpa" and ".tld".

    Args:
        domain: Domain name to check; may be followed by a "/..." path.

    Raises:
        ValueError: If any of the checks above fails.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        # Only the part before the first slash is validated as a domain name.
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".onion"):
        raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!")
    elif domain.endswith(".i2p") and config.get("allow_i2p_domain") != "true":
        # Reject I2P domains unless they were explicitly enabled. The previous
        # condition (== "true") was inverted: it rejected them exactly when
        # the configuration allowed them.
        raise ValueError(f"domain='{domain}' is an I2P, please don't crawl them!")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # NOTE(review): the suffix checks look at the whole string, so a value
    # with a path (e.g. "example.onion/foo") is not caught by them - confirm
    # whether callers may pass paths here.
    logger.debug("EXIT!")
@lru_cache
def is_in_url(domain: str, url: str) -> bool:
    """Tell whether `domain` is the network location or host name of `url`.

    `domain` is first validated with raise_on() and must not be blacklisted;
    `url` must be a non-empty string that validators.url() accepts. The
    domain is then IDNA-encoded and compared against the `netloc` and
    `hostname` components of the parsed URL.

    Args:
        domain: Domain name to look for.
        url: URL to inspect.

    Returns:
        True if the IDNA-encoded domain equals the URL's `netloc` or
        `hostname`, otherwise False.

    Raises:
        ValueError: If `domain` fails validation or is blacklisted, or if
            `url` is not a string, is empty, or is not a valid URL.
    """
    logger.debug("domain='%s',url='%s' - CALLED!", domain, url)
    raise_on(domain)

    # Guard clauses: each failing precondition raises immediately.
    if blacklist.is_blacklisted(domain):
        raise ValueError(f"domain='{domain}' is blacklisted but function was invoked")
    if not isinstance(url, str):
        raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'")
    if url == "":
        raise ValueError("Parameter 'url' is empty")
    if not validators.url(url):
        raise ValueError(f"Parameter url='{url}' is not a valid URL")

    # Compare in ASCII-compatible (IDNA) form.
    encoded = domain.encode("idna").decode("utf-8")
    parsed = urlparse(url)
    logger.debug("components[]='%s',punycode='%s'", type(parsed), encoded)

    matches = encoded == parsed.netloc or encoded == parsed.hostname
    logger.debug("is_found='%s' - EXIT!", matches)
    return matches
+
@lru_cache
def is_wanted(domain: str) -> bool:
    """Decide whether `domain` should be processed.

    Unlike a raising validator, this returns False for unwanted values; only
    a wrong type or an empty string raises.

    A domain is unwanted when it is not all lower-case, its part before the
    first "/" is not a valid domain name, it ends with ".arpa", ".onion",
    ".tld" or (unless enabled in the configuration) ".i2p", it is
    blacklisted, or it contains a "/profile/", "/users/" or "/tag/" path
    segment, or a "/c/" segment on a registered instance.

    NOTE(review): results are memoised by lru_cache (default maxsize 128),
    so a later change to the blacklist, the configuration or the registered
    instances is not reflected for a cached domain until it is evicted or
    is_wanted.cache_clear() is called - confirm this is acceptable.

    Args:
        domain: Domain name to check; may be followed by a "/..." path.

    Returns:
        True if the domain is wanted, otherwise False.

    Raises:
        ValueError: If `domain` is not a string or is empty.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Part before the first slash, i.e. the bare host name.
    host = domain.split("/")[0]

    wanted = True
    if domain.lower() != domain:
        logger.debug("domain='%s' is not all-lowercase - setting False ...", domain)
        wanted = False
    elif not validators.domain(host):
        logger.debug("domain='%s' is not a valid domain name - setting False ...", domain)
        wanted = False
    elif domain.endswith(".arpa"):
        logger.debug("domain='%s' is a domain for reversed IP addresses - setting False ...", domain)
        wanted = False
    elif domain.endswith(".onion"):
        logger.debug("domain='%s' is a TOR .onion domain - setting False ...", domain)
        wanted = False
    elif domain.endswith(".i2p") and config.get("allow_i2p_domain") != "true":
        # I2P domains are unwanted unless explicitly enabled. The previous
        # condition (== "true") was inverted: it excluded them exactly when
        # the configuration allowed them.
        logger.debug("domain='%s' is an I2P domain - setting False ...", domain)
        wanted = False
    elif domain.endswith(".tld"):
        logger.debug("domain='%s' is a fake domain - setting False ...", domain)
        wanted = False
    elif blacklist.is_blacklisted(domain):
        logger.debug("domain='%s' is blacklisted - setting False ...", domain)
        wanted = False
    elif (
        "/profile/" in domain
        or "/users/" in domain
        # Cheap substring test first, so the registry is only consulted
        # when a "/c/" segment is actually present.
        or ("/c/" in domain and instances.is_registered(host))
    ):
        # The host part is non-empty here (validated above), so a match can
        # never be at index 0; "in" is equivalent to the former find() > 0.
        logger.debug("domain='%s' is a single user", domain)
        wanted = False
    elif "/tag/" in domain:
        logger.debug("domain='%s' is a tag", domain)
        wanted = False

    logger.debug("wanted='%s' - EXIT!", wanted)
    return wanted