1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
25 from fba.helpers import blacklist
26 from fba.helpers import config
27 from fba.helpers import tidyup
28 from fba.helpers import version
30 from fba.http import network
32 from fba.models import instances
34 from fba.networks import lemmy
35 from fba.networks import misskey
36 from fba.networks import peertube
# Module-level logging setup.
# NOTE(review): calling logging.basicConfig() at import time configures the
# root logger for every consumer of this module — confirm this is intended.
38 logging.basicConfig(level=logging.INFO)
39 logger = logging.getLogger(__name__)
41 # "rel" identifiers (no real URLs)
# Known nodeinfo schema identifiers used to recognize nodeinfo links in
# /.well-known/nodeinfo responses (see fetch_wellknown_nodeinfo below).
42 nodeinfo_identifier = [
43 "https://nodeinfo.diaspora.software/ns/schema/2.1",
44 "https://nodeinfo.diaspora.software/ns/schema/2.0",
45 "https://nodeinfo.diaspora.software/ns/schema/1.1",
46 "https://nodeinfo.diaspora.software/ns/schema/1.0",
47 "http://nodeinfo.diaspora.software/ns/schema/2.1",
48 "http://nodeinfo.diaspora.software/ns/schema/2.0",
49 "http://nodeinfo.diaspora.software/ns/schema/1.1",
50 "http://nodeinfo.diaspora.software/ns/schema/1.0",
# NOTE(review): the closing "]" of this list is not visible in this listing
# (original line(s) 51-52 are elided here).
# Registers `domain` in the instances table (determining its software if not
# given), fetches its peer list, and adds every new, valid, non-blacklisted
# peer. `origin` is the domain this one was learned from; `command` records
# which fetcher invoked us; `path` is an optional nodeinfo path hint.
# Raises ValueError on malformed parameters. Returns nothing (logs "EXIT!").
# NOTE(review): this listing is elided — several original lines (e.g. the
# empty-domain check before line 58, the try: before line 76, and continue
# statements in the loop) are not visible here.
53 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
54 logger.debug(f"domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
55 if not isinstance(domain, str):
56 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
58 raise ValueError("Parameter 'domain' is empty")
59 elif domain.lower() != domain:
60 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
61 elif not validators.domain(domain.split("/")[0]):
62 raise ValueError(f"domain='{domain}' is not a valid domain")
63 elif domain.endswith(".arpa"):
64 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
65 elif domain.endswith(".tld"):
66 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
67 elif not isinstance(origin, str) and origin is not None:
68 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
69 elif software is None:
# software unknown: record the fetch attempt, then try to detect it.
70 logger.debug(f"Updating last_instance_fetch for domain='{domain}' ...")
71 instances.set_last_instance_fetch(domain)
73 logger.debug(f"software for domain='{domain}' is not set, determining ...")
76 software = determine_software(domain, path)
77 except network.exceptions as exception:
78 logger.debug(f"Exception '{type(exception)}' during determining software type")
81 logger.debug(f"Determined software='{software}' for domain='{domain}'")
82 elif not isinstance(software, str):
83 raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
84 elif not isinstance(command, str):
85 raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
87 raise ValueError("Parameter 'command' is empty")
# NOTE(review): the three domain checks below duplicate the checks already
# performed at the top of this function.
88 elif not validators.domain(domain.split("/")[0]):
89 raise ValueError(f"domain='{domain}' is not a valid domain")
90 elif domain.endswith(".arpa"):
91 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
92 elif domain.endswith(".tld"):
93 raise ValueError(f"domain='{domain}' is a fake domain")
95 if not instances.is_registered(domain):
96 logger.debug(f"Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'")
97 instances.add(domain, origin, command, path, software)
99 logger.debug(f"Updating last_instance_fetch for domain='{domain}' ...")
100 instances.set_last_instance_fetch(domain)
# NOTE(review): passing extra positional args to a message without "%s"
# placeholders (here and in several log calls below) makes stdlib logging
# fail to render the arguments — prefer logger.debug("...: %s %s", domain,
# software).
102 logger.debug("Fetching instances for domain:", domain, software)
103 peerlist = fetch_peers(domain, software)
106 logger.warning("Cannot fetch peers:", domain)
108 elif instances.has_pending(domain):
109 logger.debug(f"domain='{domain}' has pending nodeinfo data, flushing ...")
110 instances.update_data(domain)
112 logger.info(f"Checking {len(peerlist)} instances from domain='{domain}' ...")
113 for instance in peerlist:
114 logger.debug(f"instance='{instance}'")
116 # Skip "None" types as tidyup.domain() cannot parse them
119 logger.debug(f"instance='{instance}' - BEFORE")
120 instance = tidyup.domain(instance)
121 logger.debug(f"instance='{instance}' - AFTER")
124 logger.warning(f"Empty instance after tidyup.domain(), domain='{domain}'")
126 elif not validators.domain(instance.split("/")[0]):
127 logger.warning(f"Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
129 elif instance.endswith(".arpa"):
130 logger.warning(f"instance='{instance}' is a reversed .arpa domain and should not be used generally.")
132 elif blacklist.is_blacklisted(instance):
133 logger.debug("instance is blacklisted:", instance)
135 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
136 logger.debug(f"instance='{instance}' is a link to a single user profile - SKIPPED!")
138 elif instance.endswith(".tld"):
139 logger.debug(f"instance='{instance}' is a fake domain - SKIPPED!")
141 elif not instances.is_registered(instance):
142 logger.debug("Adding new instance:", instance, domain)
143 instances.add(instance, domain, command)
145 logger.debug("EXIT!")
# Returns the peer-domain list for `domain`. Software-specific fetchers are
# delegated to (misskey/lemmy/peertube); otherwise the Mastodon-style
# /api/v1/instance/peers endpoint is tried, with a fallback API call when it
# yields an error. The total peer count is stored via instances.set_total_peers.
# NOTE(review): elided listing — the empty-domain check (line 151), the
# `peers = list()` initialization (around line 175), the try: before the CSRF
# check, the early return on CSRF failure, and the final `return peers` are
# among the original lines not visible here.
147 def fetch_peers(domain: str, software: str) -> list:
148 logger.debug(f"domain({len(domain)})='{domain}',software='{software}' - CALLED!")
149 if not isinstance(domain, str):
150 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
152 raise ValueError("Parameter 'domain' is empty")
153 elif domain.lower() != domain:
154 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
155 elif not validators.domain(domain.split("/")[0]):
156 raise ValueError(f"domain='{domain}' is not a valid domain")
157 elif domain.endswith(".arpa"):
158 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
159 elif domain.endswith(".tld"):
160 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
161 elif not isinstance(software, str) and software is not None:
162 raise ValueError(f"software[]='{type(software)}' is not 'str'")
# Delegate to software-specific peer fetchers where one exists.
164 if software == "misskey":
165 logger.debug(f"Invoking misskey.fetch_peers({domain}) ...")
166 return misskey.fetch_peers(domain)
167 elif software == "lemmy":
168 logger.debug(f"Invoking lemmy.fetch_peers({domain}) ...")
169 return lemmy.fetch_peers(domain)
170 elif software == "peertube":
171 logger.debug(f"Invoking peertube.fetch_peers({domain}) ...")
172 return peertube.fetch_peers(domain)
174 # Init peers variable
177 # No CSRF by default, you don't have to add network.api_headers by yourself here
181 logger.debug(f"Checking CSRF for domain='{domain}'")
182 headers = csrf.determine(domain, dict())
183 except network.exceptions as exception:
184 logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
185 instances.set_last_error(domain, exception)
188 logger.debug(f"Fetching peers from '{domain}',software='{software}' ...")
189 data = network.get_json_api(
191 "/api/v1/instance/peers",
193 (config.get("connection_timeout"), config.get("read_timeout"))
196 logger.debug(f"data[]='{type(data)}'")
197 if "error_message" in data:
198 logger.debug("Was not able to fetch peers, trying alternative ...")
199 data = network.get_json_api(
203 (config.get("connection_timeout"), config.get("read_timeout"))
# NOTE(review): the f-string below references `response`, which is never
# assigned in this function's visible code — presumably a leftover from a
# refactor to network.get_json_api(); would raise NameError if executed.
206 logger.debug(f"response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
207 if "error_message" in data:
208 logger.warning(f"Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
209 elif "federated_instances" in data["json"]:
210 logger.debug(f"Found federated_instances for domain='{domain}'")
211 peers = peers + add_peers(data["json"]["federated_instances"])
212 logger.debug("Added instance(s) to peers")
214 message = "JSON response does not contain 'federated_instances' or 'error_message'"
215 logger.warning(f"{message},domain='{domain}'")
216 instances.set_last_error(domain, message)
217 elif isinstance(data["json"], list):
218 # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json']))
221 logger.warning(f"Cannot parse data[json][]='{type(data['json'])}'")
223 logger.debug(f"Adding '{len(peers)}' for domain='{domain}'")
224 instances.set_total_peers(domain, peers)
226 logger.debug("Returning peers[]:", type(peers))
# Fetches nodeinfo JSON for `domain`: first via the /.well-known discovery
# (fetch_wellknown_nodeinfo), then by probing a static list of request paths.
# On success marks the instance's detection mode as STATIC_CHECK and records
# the nodeinfo URL. Returns the API response dict (may carry "error_message").
# NOTE(review): elided listing — the empty-domain check, the full
# request_paths list, and the loop's break/return lines are not visible here.
229 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
230 logger.debug(f"domain='{domain}',path='{path}' - CALLED!")
231 if not isinstance(domain, str):
232 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
234 raise ValueError("Parameter 'domain' is empty")
235 elif domain.lower() != domain:
236 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
237 elif not validators.domain(domain.split("/")[0]):
238 raise ValueError(f"domain='{domain}' is not a valid domain")
239 elif domain.endswith(".arpa"):
240 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
241 elif domain.endswith(".tld"):
242 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
243 elif not isinstance(path, str) and path is not None:
244 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
# Preferred route: auto-discovery through /.well-known/nodeinfo.
246 logger.debug(f"Fetching nodeinfo from domain='{domain}' ...")
247 nodeinfo = fetch_wellknown_nodeinfo(domain)
249 logger.debug(f"nodeinfo[{type(nodeinfo)}]({len(nodeinfo)}='{nodeinfo}'")
250 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
251 logger.debug(f"Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!")
252 return nodeinfo["json"]
254 # No CSRF by default, you don't have to add network.api_headers by yourself here
259 logger.debug(f"Checking CSRF for domain='{domain}'")
260 headers = csrf.determine(domain, dict())
261 except network.exceptions as exception:
262 logger.warning(f"Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
263 instances.set_last_error(domain, exception)
266 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
267 "exception" : exception,
271 "/nodeinfo/2.1.json",
273 "/nodeinfo/2.0.json",
279 for request in request_paths:
280 logger.debug(f"path[{type(path)}]='{path}',request='{request}'")
# NOTE(review): `path == f"http://{domain}{path}"` compares `path` against a
# string built from `path` itself and so can never match for a non-empty
# domain — this looks like it was meant to be f"http://{domain}{request}";
# confirm against the full source.
281 if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
282 logger.debug(f"Fetching request='{request}' from domain='{domain}' ...")
283 if path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
284 logger.debug(f"domain='{domain}',path='{path}' has protocol in path, splitting ...")
285 components = urlparse(path)
286 path = components.path
288 data = network.get_json_api(
292 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
# NOTE(review): `response` is not assigned in this function's visible code —
# likely stale after a refactor to network.get_json_api().
295 logger.debug(f"response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
296 if "error_message" not in data:
297 logger.debug("Success:", request)
298 instances.set_detection_mode(domain, "STATIC_CHECK")
299 instances.set_nodeinfo_url(domain, request)
302 logger.warning(f"Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
304 logger.debug(f"data()={len(data)} - EXIT!")
# Performs nodeinfo auto-discovery: fetches /.well-known/nodeinfo, walks the
# "links" array, and for each link whose "rel" is in nodeinfo_identifier
# fetches the referenced nodeinfo document. On success marks the instance's
# detection mode AUTO_DISCOVERY and records the nodeinfo URL. Returns the
# last API response dict (may carry "error_message"/"exception").
# NOTE(review): elided listing — the empty-domain check, the try: before the
# CSRF call, the return of the error dict, and loop continue/break lines are
# not visible here.
307 def fetch_wellknown_nodeinfo(domain: str) -> dict:
308 logger.debug(f"domain='{domain}' - CALLED!")
309 if not isinstance(domain, str):
310 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
312 raise ValueError("Parameter 'domain' is empty")
313 elif domain.lower() != domain:
314 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
315 elif not validators.domain(domain.split("/")[0]):
316 raise ValueError(f"domain='{domain}' is not a valid domain")
317 elif domain.endswith(".arpa"):
318 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
319 elif domain.endswith(".tld"):
320 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
322 # No CSRF by default, you don't have to add network.api_headers by yourself here
326 logger.debug(f"Checking CSRF for domain='{domain}'")
327 headers = csrf.determine(domain, dict())
328 except network.exceptions as exception:
329 logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
330 instances.set_last_error(domain, exception)
# NOTE(review): here "error_message" is the exception *type*, whereas the
# same handler in fetch_nodeinfo formats a string — inconsistent shape for
# downstream consumers of this dict.
333 "error_message": type(exception),
334 "exception" : exception,
337 logger.debug("Fetching .well-known info for domain:", domain)
338 data = network.get_json_api(
340 "/.well-known/nodeinfo",
342 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
345 if "error_message" not in data:
346 nodeinfo = data["json"]
347 logger.debug("Found entries:", len(nodeinfo), domain)
348 if "links" in nodeinfo:
349 logger.debug("Found links in nodeinfo():", len(nodeinfo["links"]))
350 for link in nodeinfo["links"]:
351 logger.debug(f"link[{type(link)}]='{link}'")
352 if not isinstance(link, dict) or not "rel" in link:
353 logger.warning(f"link[]='{type(link)}' is not 'dict' or no element 'rel' found")
354 elif link["rel"] in nodeinfo_identifier:
355 # Default is that 'href' has a complete URL, but some hosts don't send that
357 components = urlparse(link["href"])
359 logger.debug(f"components[{type(components)}]='{components}'")
360 if components.scheme == "" and components.netloc == "":
# NOTE(review): `url` is used below before any visible assignment —
# presumably `url = link["href"]` was on an elided line (356); verify
# against the full source.
361 logger.debug(f"link[href]='{link['href']}' has no scheme and host name in it, prepending from domain='{domain}'")
362 url = f"https://{domain}{url}"
363 components = urlparse(url)
365 if not validators.domain(components.netloc):
366 logger.warning(f"components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
# NOTE(review): the two checks below test `domain` (our own parameter), not
# the linked host `components.netloc` — possibly intentional, but inconsistent
# with the netloc checks around them.
368 elif domain.endswith(".arpa"):
369 logger.warning(f"domain='{domain}' is a domain for reversed IP addresses - SKIPPED!")
371 elif domain.endswith(".tld"):
372 logger.warning(f"domain='{domain}' is a fake domain - SKIPPED!")
374 elif blacklist.is_blacklisted(components.netloc):
375 logger.debug(f"components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
378 logger.debug("Fetching nodeinfo from:", url)
379 data = network.fetch_api_url(
381 (config.get("connection_timeout"), config.get("read_timeout"))
384 logger.debug("href,data[]:", link["href"], type(data))
385 if "error_message" not in data and "json" in data:
386 logger.debug("Found JSON nodeinfo():", len(data))
387 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
388 instances.set_nodeinfo_url(domain, link["href"])
391 instances.set_last_error(domain, data)
393 logger.warning("Unknown 'rel' value:", domain, link["rel"])
395 logger.warning("nodeinfo does not contain 'links':", domain)
397 logger.debug("Returning data[]:", type(data))
# Fallback software detection: fetches the HTML page at `path` on `domain`
# and derives a software name from the <meta name="generator"> tag or, failing
# that, the og:site_name property. The detected name is cleaned of version
# numbers and "powered by"/"hosted on"/" by "/" see " suffixes. Records the
# detection mode (GENERATOR or SITE_NAME) on success.
# NOTE(review): elided listing — the `software = None` initialization, the
# empty-path check, and the final return are not visible here.
400 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
401 logger.debug(f"domain({len(domain)})='{domain}',path='{path}' - CALLED!")
402 if not isinstance(domain, str):
403 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
405 raise ValueError("Parameter 'domain' is empty")
406 elif domain.lower() != domain:
407 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
408 elif not validators.domain(domain.split("/")[0]):
409 raise ValueError(f"domain='{domain}' is not a valid domain")
410 elif domain.endswith(".arpa"):
411 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
412 elif domain.endswith(".tld"):
413 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
414 elif not isinstance(path, str):
415 raise ValueError(f"path[]='{type(path)}' is not 'str'")
417 raise ValueError("Parameter 'path' is empty")
419 logger.debug(f"domain='{domain}',path='{path}' - CALLED!")
422 logger.debug(f"Fetching path='{path}' from '{domain}' ...")
423 response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
425 logger.debug("domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
# NOTE(review): `.find("<html") > 0` skips a body that starts exactly with
# "<html" (index 0) — `>= 0` / `in` would be the usual test; confirm intent.
426 if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
427 logger.debug(f"Parsing response.text()={len(response.text)} Bytes ...")
428 doc = bs4.BeautifulSoup(response.text, "html.parser")
430 logger.debug("doc[]:", type(doc))
431 generator = doc.find("meta", {"name" : "generator"})
432 site_name = doc.find("meta", {"property": "og:site_name"})
434 logger.debug(f"generator='{generator}',site_name='{site_name}'")
435 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
436 logger.debug("Found generator meta tag:", domain)
437 software = tidyup.domain(generator.get("content"))
438 logger.debug(f"software[{type(software)}]='{software}'")
439 if software is not None and software != "":
440 logger.info(f"domain='{domain}' is generated by '{software}'")
441 instances.set_detection_mode(domain, "GENERATOR")
442 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
443 logger.debug("Found property=og:site_name:", domain)
444 software = tidyup.domain(site_name.get("content"))
445 logger.debug(f"software[{type(software)}]='{software}'")
446 if software is not None and software != "":
447 logger.info(f"domain='{domain}' has og:site_name='{software}'")
448 instances.set_detection_mode(domain, "SITE_NAME")
450 logger.debug(f"software[]='{type(software)}'")
451 if isinstance(software, str) and software == "":
452 logger.debug(f"Corrected empty string to None for software of domain='{domain}'")
454 elif isinstance(software, str) and ("." in software or " " in software):
455 logger.debug(f"software='{software}' may contain a version number, domain='{domain}', removing it ...")
456 software = version.remove(software)
458 logger.debug(f"software[]='{type(software)}'")
# Strip marketing suffixes; only the first matching pattern is applied.
459 if isinstance(software, str) and "powered by " in software:
460 logger.debug(f"software='{software}' has 'powered by' in it")
461 software = version.remove(version.strip_powered_by(software))
462 elif isinstance(software, str) and " hosted on " in software:
463 logger.debug(f"software='{software}' has 'hosted on' in it")
464 software = version.remove(version.strip_hosted_on(software))
465 elif isinstance(software, str) and " by " in software:
466 logger.debug(f"software='{software}' has ' by ' in it")
467 software = version.strip_until(software, " by ")
468 elif isinstance(software, str) and " see " in software:
469 logger.debug(f"software='{software}' has ' see ' in it")
470 software = version.strip_until(software, " see ")
472 logger.debug(f"software='{software}' - EXIT!")
475 def determine_software(domain: str, path: str = None) -> str:
476 logger.debug(f"domain({len(domain)})='{domain}',path='{path}' - CALLED!")
477 if not isinstance(domain, str):
478 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
480 raise ValueError("Parameter 'domain' is empty")
481 elif domain.lower() != domain:
482 raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
483 elif not validators.domain(domain.split("/")[0]):
484 raise ValueError(f"domain='{domain}' is not a valid domain")
485 elif domain.endswith(".arpa"):
486 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
487 elif domain.endswith(".tld"):
488 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
489 elif not isinstance(path, str) and path is not None:
490 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
492 logger.debug("Determining software for domain,path:", domain, path)
495 logger.debug(f"Fetching nodeinfo from '{domain}' ...")
496 data = fetch_nodeinfo(domain, path)
498 logger.debug(f"data[{type(data)}]='{data}'")
499 if "exception" in data:
500 # Continue raising it
501 raise data["exception"]
502 elif "error_message" in data:
503 logger.debug(f"Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'")
504 return fetch_generator_from_path(domain)
505 elif "status" in data and data["status"] == "error" and "message" in data:
506 logger.warning("JSON response is an error:", data["message"])
507 instances.set_last_error(domain, data["message"])
508 return fetch_generator_from_path(domain)
509 elif "message" in data:
510 logger.warning("JSON response contains only a message:", data["message"])
511 instances.set_last_error(domain, data["message"])
512 return fetch_generator_from_path(domain)
513 elif "software" not in data or "name" not in data["software"]:
514 logger.debug(f"JSON response from domain='{domain}' does not include [software][name], fetching / ...")
515 software = fetch_generator_from_path(domain)
516 logger.debug(f"Generator for domain='{domain}' is: '{software}'")
517 elif "software" in data and "name" in data["software"]:
518 logger.debug("Found data[software][name] in JSON response")
519 software = data["software"]["name"]
522 logger.debug("Returning None - EXIT!")
525 sofware = tidyup.domain(software)
526 logger.debug("sofware after tidyup.domain():", software)
528 if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
529 logger.debug("Setting pleroma:", domain, software)
531 elif software in ["hometown", "ecko"]:
532 logger.debug("Setting mastodon:", domain, software)
533 software = "mastodon"
534 elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
535 logger.debug("Setting misskey:", domain, software)
537 elif software == "runtube.re":
538 logger.debug("Setting peertube:", domain, software)
539 software = "peertube"
540 elif software == "nextcloud social":
541 logger.debug("Setting nextcloud:", domain, software)
542 software = "nextcloud"
543 elif software.find("/") > 0:
544 logger.warning("Spliting of slash:", software)
545 software = tidyup.domain(software.split("/")[-1])
546 elif software.find("|") > 0:
547 logger.warning("Spliting of pipe:", software)
548 software = tidyup.domain(software.split("|")[0])
549 elif "powered by" in software:
550 logger.debug(f"software='{software}' has 'powered by' in it")
551 software = version.strip_powered_by(software)
552 elif isinstance(software, str) and " by " in software:
553 logger.debug(f"software='{software}' has ' by ' in it")
554 software = version.strip_until(software, " by ")
555 elif isinstance(software, str) and " see " in software:
556 logger.debug(f"software='{software}' has ' see ' in it")
557 software = version.strip_until(software, " see ")
559 logger.debug(f"software[]='{type(software)}'")
561 logger.warning("tidyup.domain() left no software name behind:", domain)
564 logger.debug(f"software[]='{type(software)}'")
565 if str(software) == "":
566 logger.debug(f"software for '{domain}' was not detected, trying generator ...")
567 software = fetch_generator_from_path(domain)
568 elif len(str(software)) > 0 and ("." in software or " " in software):
569 logger.debug(f"software='{software}' may contain a version number, domain='{domain}', removing it ...")
570 software = version.remove(software)
572 logger.debug(f"software[]='{type(software)}'")
573 if isinstance(software, str) and "powered by" in software:
574 logger.debug(f"software='{software}' has 'powered by' in it")
575 software = version.remove(version.strip_powered_by(software))
577 logger.debug("Returning domain,software:", domain, software)
# Extracts (domain, reason) pairs from an HTML table: first <td> is the
# domain, second <td> the block reason. Invalid, fake, .arpa and blacklisted
# domains are skipped; one known row containing two comma-joined gab.com
# domains is special-cased. Raises ValueError/KeyError on bad input.
# NOTE(review): elided listing — the `domains = list()` initialization, the
# continue statements, the full gab.com special-case dict entries, the
# domains.append(...) call and the final `return domains` are not visible here.
580 def find_domains(tag: bs4.element.Tag) -> list:
581 logger.debug(f"tag[]='{type(tag)}' - CALLED!")
582 if not isinstance(tag, bs4.element.Tag):
583 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
584 elif len(tag.select("tr")) == 0:
585 raise KeyError("No table rows found in table!")
588 for element in tag.select("tr"):
589 logger.debug(f"element[]='{type(element)}'")
590 if not element.find("td"):
591 logger.debug("Skipping element, no <td> found")
# NOTE(review): findAll("td")[1] raises IndexError on rows with a single
# cell — the "no <td>" guard above does not cover that case; confirm the
# scraped tables always have two columns.
594 domain = tidyup.domain(element.find("td").text)
595 reason = tidyup.reason(element.findAll("td")[1].text)
597 logger.debug(f"domain='{domain}',reason='{reason}'")
599 if not validators.domain(domain.split("/")[0]):
600 logger.warning(f"domain='{domain}' is not a valid domain - SKIPPED!")
602 elif domain.endswith(".arpa"):
603 logger.warning(f"domain='{domain}' is a domain for reversed IP addresses - SKIPPED!")
605 elif domain.endswith(".tld"):
606 logger.warning(f"domain='{domain}' is a fake domain - SKIPPED!")
608 elif blacklist.is_blacklisted(domain):
609 logger.debug(f"domain='{domain}' is blacklisted - SKIPPED!")
611 elif domain == "gab.com/.ai, develop.gab.com":
612 logger.debug("Multiple domains detected in one row")
622 "domain": "develop.gab.com",
# NOTE(review): this validators check duplicates the one at the top of the
# elif chain and is unreachable as written.
626 elif not validators.domain(domain.split("/")[0]):
627 logger.warning(f"domain='{domain}' is not a valid domain - SKIPPED!")
630 logger.debug(f"Adding domain='{domain}',reason='{reason}' ...")
636 logger.debug(f"domains()={len(domains)} - EXIT!")
# Flattens a Lemmy/Friendica-style "federated_instances" dict into a list of
# peer domains: each of the "linked"/"allowed"/"blocked" keys may hold a list
# of either plain domain strings or dicts with a "domain" entry. Invalid,
# fake, .arpa and blacklisted peers are skipped.
# NOTE(review): elided listing — the `peers = list()` initialization, the
# continue statements, the peers.append(peer) call and the final
# `return peers` are not visible here.
639 def add_peers(rows: dict) -> list:
640 logger.debug(f"rows()={len(rows)} - CALLED!")
642 for key in ["linked", "allowed", "blocked"]:
643 logger.debug(f"Checking key='{key}'")
644 if key not in rows or rows[key] is None:
645 logger.warning(f"Cannot find key='{key}' or it is NoneType - SKIPPED!")
648 logger.debug(f"Adding {len(rows[key])} peer(s) to peers list ...")
649 for peer in rows[key]:
650 logger.debug(f"peer='{peer}' - BEFORE!")
# Normalize either representation to a tidied domain string.
651 if isinstance(peer, dict) and "domain" in peer:
652 logger.debug(f"peer[domain]='{peer['domain']}'")
653 peer = tidyup.domain(peer["domain"])
654 elif isinstance(peer, str):
655 logger.debug(f"peer='{peer}'")
656 peer = tidyup.domain(peer)
658 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
660 logger.debug(f"peer='{peer}' - AFTER!")
# NOTE(review): unlike the other call sites in this module, this check does
# not use peer.split("/")[0] — a peer carrying a path component will be
# rejected here instead of normalized; confirm which behavior is wanted.
661 if not validators.domain(peer):
662 logger.warning(f"peer='{peer}' is not a valid domain - SKIPPED!")
664 elif peer.endswith(".arpa"):
665 logger.warning(f"peer='{peer}' is a domain for reversed IP addresses -SKIPPED!")
667 elif peer.endswith(".tld"):
668 logger.warning(f"peer='{peer}' is a fake domain - SKIPPED!")
670 elif blacklist.is_blacklisted(peer):
671 logger.debug(f"peer='{peer}' is blacklisted - SKIPPED!")
674 logger.debug(f"Adding peer='{peer}' ...")
677 logger.debug(f"peers()={len(peers)} - EXIT!")