1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
2 # Copyright (C) 2023 Free Software Foundation
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU Affero General Public License as published
6 # by the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU Affero General Public License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 from functools import lru_cache
20 from urllib.parse import urlparse
21 from urllib.parse import urlunparse
25 from fba.helpers import blacklist
26 from fba.helpers import config
28 from fba.models import instances
# Configure the root logger at INFO level when this module is imported
# (logging.basicConfig() does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by every function below.
logger = logging.getLogger(__name__)
def raise_on(domain: str) -> None:
    """Validate `domain` and raise if it must not be processed.

    Only the part before the first "/" is checked by `validators.domain()`,
    so a value may carry a path suffix.

    Args:
        domain: Domain name (optionally followed by a path) to validate.

    Raises:
        ValueError: If `domain` is not a `str`, is empty, is not all
            lower-case, contains a question-mark, is rejected by
            `validators.domain()`, or ends with `.onion`, `.i2p` (unless
            the "allow_i2p_domain" configuration value is truthy),
            `.arpa` or `.tld`.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    # Basic parameter checks first, so the string methods below are safe
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")
    if domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    if "?" in domain:
        raise ValueError(f"Parameter domain='{domain}' contains a question-mark")

    # Syntactic validation of the host part only (everything before the first slash)
    if not validators.domain(domain.split("/")[0], rfc_2782=True):
        raise ValueError(f"domain='{domain}' is not a valid domain")

    # Top-level domains that must never be crawled
    if domain.endswith(".onion"):
        raise ValueError(f"domain='{domain}' is a TOR, please don't crawl them!")
    if domain.endswith(".i2p") and not config.get("allow_i2p_domain"):
        raise ValueError(f"domain='{domain}' is an I2P, please don't crawl them!")
    if domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    if domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
def is_in_url(domain: str, url: str) -> bool:
    """Check whether `domain` is the host of `url`.

    The domain is converted with `encode_idna()` and compared against both
    the `netloc` and the `hostname` of the parsed URL.

    Args:
        domain: Domain name to look for.
        url: URL to inspect.

    Returns:
        True if the encoded domain equals the URL's `netloc` or `hostname`,
        otherwise False.

    Raises:
        ValueError: If `domain` is blacklisted, or if `url` is not a `str`,
            is empty, or is rejected by `validators.url()`.
    """
    logger.debug("domain='%s',url='%s' - CALLED!", domain, url)

    # NOTE(review): `domain` reaches blacklist.is_blacklisted() without any
    # validation in this function - confirm whether raise_on(domain) is
    # expected to run first.
    if blacklist.is_blacklisted(domain):
        raise ValueError(f"domain='{domain}' is blacklisted but function was invoked")
    if not isinstance(url, str):
        raise ValueError(f"Parameter url[]='{type(url)}' is not of type 'str'")
    if url == "":
        raise ValueError("Parameter 'url' is empty")
    if not validators.url(url):
        raise ValueError(f"Parameter url='{url}' is not a valid URL")

    punycode = encode_idna(domain)
    logger.debug("punycode='%s'", punycode)

    components = urlparse(url)
    logger.debug("components[]='%s',punycode='%s'", type(components), punycode)

    # netloc may include port/credentials, hostname never does - accept either
    is_found = punycode in (components.netloc, components.hostname)

    logger.debug("is_found='%s' - EXIT!", is_found)
    return is_found
def is_tld_wanted(domain: str) -> bool:
    """Tell whether the top-level domain of `domain` is one to be crawled.

    Args:
        domain: Domain name to check.

    Returns:
        False if `domain` ends with `.onion`, `.i2p` (unless the
        "allow_i2p_domain" configuration value is truthy), `.arpa` or
        `.tld`; True otherwise.

    Raises:
        ValueError: If `domain` is not a `str` or is empty.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Wanted by default; each branch below vetoes one unwanted suffix
    wanted = True

    if domain.endswith(".onion"):
        logger.debug("domain='%s' is a TOR .onion domain - setting wanted=False ...", domain)
        wanted = False
    elif domain.endswith(".i2p") and not config.get("allow_i2p_domain"):
        # Configuration is only consulted for .i2p names
        logger.debug("domain='%s' is an I2P .i2p domain - setting wanted=False ...", domain)
        wanted = False
    elif domain.endswith(".arpa"):
        logger.debug("domain='%s' is a reverse IP address - setting wanted=False ...", domain)
        wanted = False
    elif domain.endswith(".tld"):
        logger.debug("domain='%s' is a fake domain - setting wanted=False ...", domain)
        wanted = False

    logger.debug("wanted='%s' - EXIT!", wanted)
    return wanted
def is_wanted(domain: str) -> bool:
    """Decide whether `domain` should be processed at all.

    Unlike `raise_on()`, unwanted values are reported through the return
    value instead of an exception.

    Args:
        domain: Domain name, optionally followed by a path.

    Returns:
        False if `domain` is not all lower-case, its host part is rejected
        by `validators.domain()`, its TLD is unwanted, it is blacklisted,
        or it looks like a single-user or tag URL; True otherwise.

    Raises:
        ValueError: If `domain` is not a `str` or is empty.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not of type 'str'")
    if domain == "":
        raise ValueError("Parameter 'domain' is empty")

    # Wanted by default; the first matching rule below vetoes it
    wanted = True

    if domain.lower() != domain:
        logger.debug("domain='%s' is not all-lowercase - setting False ...", domain)
        wanted = False
    elif not validators.domain(domain.split("/")[0], rfc_2782=True):
        logger.debug("domain='%s' is not a valid domain name - setting False ...", domain)
        wanted = False
    elif not is_tld_wanted(domain):
        logger.debug("domain='%s' has an unwanted TLD - setting False ...", domain)
        wanted = False
    elif blacklist.is_blacklisted(domain):
        logger.debug("domain='%s' is blacklisted - setting False ...", domain)
        wanted = False
    elif (
        # The host part was validated above, so these markers can never sit
        # at index 0 and a plain substring test is sufficient.
        "/profile/" in domain
        or "/users/" in domain
        or (instances.is_registered(domain.split("/")[0]) and "/c/" in domain)
    ):
        logger.debug("domain='%s' is a single user", domain)
        wanted = False
    elif "/tag/" in domain:
        logger.debug("domain='%s' is a tag", domain)
        wanted = False

    logger.debug("wanted='%s' - EXIT!", wanted)
    return wanted
def encode_idna(domain: str) -> str:
    """Convert `domain` to its IDNA (punycode) ASCII form.

    Leading dots and anything from the first "?" onwards are removed
    before encoding. If the remaining value contains a path, only the host
    part is IDNA-encoded and the path is appended unchanged.

    Args:
        domain: Domain name, optionally followed by a path and/or query.

    Returns:
        The IDNA-encoded host, plus the original path when one was given.
    """
    logger.debug("domain='%s' - CALLED!", domain)

    # Strip leading dots and cut off any query string
    punycode = domain.lstrip(".").split("?")[0]
    logger.debug("punycode='%s' - AFTER!", punycode)

    if "/" in punycode:
        # A scheme is prepended only so urlparse() splits host from path
        components = urlparse("https://" + punycode)
        logger.debug("components[%s](%d)='%s'", type(components), len(components), components)

        punycode = components.netloc.encode("idna").decode("utf-8") + components.path
        logger.debug("punycode='%s',domain='%s'", punycode, domain)
    else:
        # Encode the sanitised value, not the raw parameter, so the
        # stripping above is not thrown away.
        punycode = punycode.encode("idna").decode("utf-8")

    logger.debug("punycode='%s' - EXIT!", punycode)
    return punycode