1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 from urllib.parse import urlparse
23 from fba.helpers import blacklist
24 from fba.helpers import config
25 from fba.helpers import tidyup
26 from fba.helpers import version
28 from fba.http import network
30 from fba.models import instances
32 from fba.networks import lemmy
33 from fba.networks import misskey
34 from fba.networks import peertube
# "rel" identifiers (no real URLs)
# Compared against the 'rel' value of each link found in /.well-known/nodeinfo;
# both https:// and http:// spellings of the schema identifiers are accepted.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Fetch the peers of 'domain' and register every new, acceptable peer.

    'domain' itself is registered through instances.add() when it is not yet
    known. Each peer returned by fetch_peers() is normalised with
    tidyup.domain() and added with 'domain' as its origin, unless it is empty,
    invalid, a ".arpa"/".tld" name, blacklisted, a link to a single user
    profile or already registered.

    Args:
        domain: Domain name to crawl; must be a non-empty, lower-case string.
        origin: Value stored as origin of 'domain' when it gets registered;
            may be None.
        software: Software name of 'domain'. When None, it is determined with
            determine_software(); a network exception during that lookup is
            ignored and the value stays None.
        command: Non-empty command name passed on to instances.add().
        path: Optional nodeinfo path passed on to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type, is empty, 'domain' is
            not lower-case, not a valid domain or ends with ".arpa" or ".tld".

    NOTE(review): the reviewed copy of this function had lost its short
    statements (guards such as 'elif domain == "":', 'try:', 'pass',
    'continue', 'return'); they were restored from the surrounding messages
    and must be confirmed against the canonical file.
    NOTE(review): as in the reviewed copy, 'command' is only validated when
    'software' is not None because all checks share one if/elif chain.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif software is None:
        # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
        instances.set_last_instance_fetch(domain)

        # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
        try:
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # DEBUG: print(f"DEBUG: Exception '{type(exception)}' during determining software type")
            # Best effort: 'software' stays None and the generic peers API is tried below
            pass

        # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")

    if not instances.is_registered(domain):
        # DEBUG: print(f"DEBUG: Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'")
        instances.add(domain, origin, command, path, software)

    # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
    instances.set_last_instance_fetch(domain)

    # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending(domain):
        # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        # DEBUG: print(f"DEBUG: instance='{instance}'")
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
        instance = tidyup.domain(instance)
        # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
            continue
        elif instance.endswith(".arpa"):
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            # DEBUG: print("DEBUG: instance is blacklisted:", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            # DEBUG: print(f"DEBUG: instance='{instance}' is a link to a single user profile - SKIPPED!")
            continue
        elif instance.endswith(".tld"):
            # DEBUG: print(f"DEBUG: instance='{instance}' is a fake domain - SKIPPED!")
            continue
        elif not instances.is_registered(instance):
            # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
            instances.add(instance, domain, command)

    # DEBUG: print("DEBUG: EXIT!")
def fetch_peers(domain: str, software: str) -> list:
    """Return the list of peers reported by 'domain'.

    For "misskey", "lemmy" and "peertube" the software-specific fetch_peers()
    implementation is used. For everything else "/api/v1/instance/peers" is
    queried and, if that returns an error, an alternative API path whose
    'federated_instances' element is fed through add_peers(). The resulting
    list is also stored with instances.set_total_peers().

    Args:
        domain: Domain name to query; must be a non-empty, lower-case string.
        software: Software name of 'domain' or None.

    Returns:
        The peers found; an empty list when the CSRF check failed or no API
        delivered usable data.

    Raises:
        ValueError: If a parameter has the wrong type, 'domain' is empty, not
            lower-case, not a valid domain or ends with ".arpa" or ".tld".

    NOTE(review): the reviewed copy of this function had lost its short
    statements (initialisers, 'try:', call arguments, 'else:', 'return');
    they were restored from context and must be confirmed against the
    canonical file.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',software='{software}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return peers

    # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        (config.get("connection_timeout"), config.get("read_timeout"))
    )

    # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
    if "error_message" in data:
        # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
        data = network.get_json_api(
            domain,
            # NOTE(review): this path was missing from the reviewed copy; "/api/v3/site"
            # is presumed because the answer is searched for 'federated_instances' - confirm
            "/api/v3/site",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
            peers = peers + add_peers(data["json"]["federated_instances"])
            # DEBUG: print("DEBUG: Added instance(s) to peers")
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.set_last_error(domain, message)
    elif isinstance(data["json"], list):
        # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json']))
        peers = data["json"]
    else:
        print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'")

    # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
    instances.set_total_peers(domain, peers)

    # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo data of 'domain', first by auto-discovery, then from static paths.

    fetch_wellknown_nodeinfo() is tried first; when its answer carries a
    non-empty 'json' element and no 'error_message', that element is
    returned. Otherwise a list of well-known static nodeinfo paths is
    requested one after another until one answers without 'error_message'.
    On such a success the detection mode "STATIC_CHECK" and the request path
    are stored through the instances model.

    Args:
        domain: Domain name to query; must be a non-empty, lower-case string.
        path: Optional restriction to a single request path. It may be the
            bare path or a full http(s):// URL on 'domain'. None tries all.

    Returns:
        A dict: the discovered nodeinfo JSON, the result of the last static
        request, or an error dict with the keys 'status_code',
        'error_message' and 'exception' when the CSRF check raised.

    Raises:
        ValueError: If a parameter has the wrong type, 'domain' is empty, not
            lower-case, not a valid domain or ends with ".arpa" or ".tld".

    NOTE(review): the reviewed copy of this function had lost its short
    statements (initialisers, 'try:', parts of the error dict, several
    entries of the request path list, call arguments, 'break', 'return');
    they were restored from context and must be confirmed against the
    canonical file.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]({len(nodeinfo)}='{nodeinfo}'")
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        # DEBUG: print(f"DEBUG: Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!")
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    data = dict()
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            # NOTE(review): the status code entry was missing from the reviewed copy; 500 is presumed
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # NOTE(review): only the two ".json" entries were visible in the reviewed
    # copy; the remaining entries are presumed - confirm against the canonical file
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    for request in request_paths:
        # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'")
        # A full URL matches when it points at this very request on 'domain'.
        # (Comparing 'path' with a string that embeds 'path' itself can never be true.)
        full_urls = (f"http://{domain}{request}", f"https://{domain}{request}")
        if path is None or path == request or path in full_urls:
            # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
            if path in full_urls:
                # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' has protocol in path, splitting ...")
                components = urlparse(path)
                path = components.path

            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
            if "error_message" not in data:
                # DEBUG: print("DEBUG: Success:", request)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break

            print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    # NOTE(review): the auto-discovery branch above returns the bare 'json'
    # element while this returns the whole request result - verify that callers
    # cope with both shapes.
    # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo of 'domain' through "/.well-known/nodeinfo".

    The well-known document's 'links' are scanned for a 'rel' value listed in
    nodeinfo_identifier. The 'href' of such a link is fetched (a relative
    'href' gets "https://" plus 'domain' prepended). Links whose host is
    blacklisted or not a valid domain are skipped. On the first successful
    fetch the detection mode "AUTO_DISCOVERY" and the link's 'href' are stored
    through the instances model and the scan stops.

    Args:
        domain: Domain name to query; must be a non-empty, lower-case string.

    Returns:
        The result dict of the last request made, or an error dict with the
        keys 'status_code', 'error_message' and 'exception' when the CSRF
        check raised.

    Raises:
        ValueError: If 'domain' has the wrong type, is empty, not lower-case,
            not a valid domain or ends with ".arpa" or ".tld".

    NOTE(review): the reviewed copy of this function had lost its short
    statements (initialisers, 'try:', parts of the error dict, call
    arguments, 'continue', 'break', 'else:', 'return'); they were restored
    from context and must be confirmed against the canonical file.
    """
    # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            # NOTE(review): the status code entry was missing from the reviewed copy; 500 is presumed
            "status_code"  : 500,
            # A readable string (same format as in fetch_nodeinfo()) instead of a bare type object
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" not in data:
        nodeinfo = data["json"]
        # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
        if "links" in nodeinfo:
            # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
            for link in nodeinfo["links"]:
                # DEBUG: print(f"DEBUG: link[{type(link)}]='{link}'")
                if not isinstance(link, dict) or "rel" not in link:
                    print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found")
                elif link["rel"] in nodeinfo_identifier:
                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    url = link["href"]
                    components = urlparse(link["href"])

                    # DEBUG: print(f"DEBUG: components[{type(components)}]='{components}'")
                    if components.scheme == "" and components.netloc == "":
                        # DEBUG: print(f"DEBUG: link[href]='{link['href']}' has no scheme and host name in it, prepending from domain='{domain}'")
                        url = f"https://{domain}{url}"
                        components = urlparse(url)

                    if blacklist.is_blacklisted(components.netloc):
                        print(f"WARNING: components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
                        continue
                    elif not validators.domain(components.netloc):
                        print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
                        continue

                    # DEBUG: print("DEBUG: Fetching nodeinfo from:", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])
                        break
                    else:
                        instances.set_last_error(domain, data)
                else:
                    print("WARNING: Unknown 'rel' value:", domain, link["rel"])
        else:
            print("WARNING: nodeinfo does not contain 'links':", domain)

    # DEBUG: print("DEBUG: Returning data[]:", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of 'domain' from the HTML page at 'path'.

    The page is fetched with network.fetch_response(). The content of a
    <meta name="generator"> tag is preferred; otherwise the content of
    <meta property="og:site_name"> is used. The value is normalised with
    tidyup.domain(), stripped of version numbers and of phrases such as
    "powered by", " hosted on ", " by " and " see ". The detection mode
    ("GENERATOR" or "SITE_NAME") is stored through the instances model.

    Args:
        domain: Domain name to query; must be a non-empty, lower-case string.
        path: Non-empty path of the page to fetch.

    Returns:
        The detected software name, or None when nothing usable was found.

    Raises:
        ValueError: If a parameter has the wrong type or is empty, 'domain'
            is not lower-case, not a valid domain or ends with ".arpa" or
            ".tld".

    NOTE(review): the reviewed copy of this function had lost its short
    statements (the empty-string guards, 'software = None', 'return
    software'); they were restored from context and must be confirmed
    against the canonical file.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
    software = None

    # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
    # Containment test: a document starting directly with "<html" (index 0) is HTML, too
    if response.ok and response.status_code < 300 and "<html" in response.text:
        # DEBUG: print(f"DEBUG: Parsing response.text()={len(response.text)} Bytes ...")
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        # DEBUG: print("DEBUG: doc[]:", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            # DEBUG: print("DEBUG: Found generator meta tag:", domain)
            software = tidyup.domain(generator.get("content"))
            # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'")
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
            software = tidyup.domain(site_name.get("content"))
            # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'")
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set_detection_mode(domain, "SITE_NAME")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and software == "":
        # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = version.remove(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and "powered by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = version.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software 'domain' is running.

    The name is taken from [software][name] of the data returned by
    fetch_nodeinfo(). When nodeinfo reports an error or lacks that element,
    fetch_generator_from_path() is used instead. The name is normalised with
    tidyup.domain(), known forks are mapped to their parent project
    ("pleroma", "mastodon", "misskey", "peertube", "nextcloud") and version
    numbers as well as "powered by"-like phrases are stripped.

    Args:
        domain: Domain name to query; must be a non-empty, lower-case string.
        path: Optional nodeinfo path passed on to fetch_nodeinfo().

    Returns:
        The software name, or None when none could be determined.

    Raises:
        ValueError: If a parameter has the wrong type, 'domain' is empty, not
            lower-case, not a valid domain or ends with ".arpa" or ".tld".
        Exception: Whatever exception fetch_nodeinfo() reported in its
            'exception' element is re-raised.

    NOTE(review): the reviewed copy of this function had lost its short
    statements (the empty-string guards, 'software = None', the "pleroma" and
    "misskey" assignments, the 'return' statements); they were restored from
    context and must be confirmed against the canonical file.
    """
    # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
    software = None

    # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
    data = fetch_nodeinfo(domain, path)

    # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'")
    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        # DEBUG: print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'")
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
        software = fetch_generator_from_path(domain)
        # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: '{software}'")
    elif "software" in data and "name" in data["software"]:
        # DEBUG: print("DEBUG: Found data[software][name] in JSON response")
        software = data["software"]["name"]

    if software is None:
        # DEBUG: print("DEBUG: Returning None - EXIT!")
        return None

    # The tidied value must replace 'software', otherwise every comparison
    # below works on the raw, unnormalised name
    software = tidyup.domain(software)
    # DEBUG: print("DEBUG: software after tidyup.domain():", software)

    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        # DEBUG: print("DEBUG: Setting misskey:", domain, software)
        software = "misskey"
    elif software == "runtube.re":
        # DEBUG: print("DEBUG: Setting peertube:", domain, software)
        software = "peertube"
    elif software == "nextcloud social":
        # DEBUG: print("DEBUG: Setting nextcloud:", domain, software)
        software = "nextcloud"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
        software = version.strip_until(software, " see ")

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        return None

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if str(software) == "":
        # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
        software = version.remove(software)

    # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
    if isinstance(software, str) and "powered by" in software:
        # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
        software = version.remove(version.strip_powered_by(software))

    # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract domain/reason pairs from the rows of an HTML table.

    For every <tr> containing a <td>, the first cell is normalised with
    tidyup.domain() and the second one with tidyup.reason(). Rows with a
    blacklisted or invalid domain are skipped with a warning. The literal
    cell "gab.com/.ai, develop.gab.com" is expanded into separate entries.

    Args:
        tag: The table element to scan.

    Returns:
        A list of dicts, each with the keys 'domain' and 'reason'.

    Raises:
        ValueError: If 'tag' is not a bs4.element.Tag.
        KeyError: If 'tag' contains no <tr> element.

    NOTE(review): most statements of this function (the 'domains'
    initialiser, every 'domains.append(...)' call except one "domain" entry,
    the 'continue' statements and the final 'return') were missing from the
    reviewed copy. They were restored from the debug messages and the one
    surviving entry; in particular the 'reason' key and the entries
    "gab.com" and "gab.ai" are presumed - confirm against the canonical file.
    """
    # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!")
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in tag.select("tr"):
        # DEBUG: print(f"DEBUG: element[]='{type(element)}'")
        if not element.find("td"):
            # DEBUG: print("DEBUG: Skipping element, no <td> found")
            continue

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # DEBUG: print("DEBUG: Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
            continue

        # DEBUG: print(f"DEBUG: Adding domain='{domain}',reason='{reason}' ...")
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
    return domains
def add_peers(rows: dict) -> list:
    """Collect all acceptable peers from the "linked", "allowed" and "blocked" lists.

    Each peer is normalised with tidyup.domain(). Peers that are not a valid
    domain, end with ".arpa" or ".tld", or are blacklisted are skipped with a
    warning.

    Args:
        rows: Mapping that may contain the keys "linked", "allowed" and
            "blocked"; a missing key or a None value is ignored.

    Returns:
        The normalised peers that passed all checks, in encounter order.

    NOTE(review): the 'peers' initialiser, the 'continue' statements, the
    append call and the final 'return' were missing from the reviewed copy;
    they were restored from the debug messages and must be confirmed against
    the canonical file.
    """
    # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        # DEBUG: print(f"DEBUG: Checking key='{key}'")
        if key in rows and rows[key] is not None:
            # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
            for peer in rows[key]:
                # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
                peer = tidyup.domain(peer)

                # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
                if not validators.domain(peer):
                    print(f"WARNING: peer='{peer}' is not a valid domain - SKIPPED!")
                    continue
                elif peer.endswith(".arpa"):
                    print(f"WARNING: peer='{peer}' is a domain for reversed IP addresses -SKIPPED!")
                    continue
                elif peer.endswith(".tld"):
                    print(f"WARNING: peer='{peer}' is a fake domain - SKIPPED!")
                    continue
                elif blacklist.is_blacklisted(peer):
                    print(f"WARNING: peer='{peer}' is blacklisted - SKIPPED!")
                    continue

                # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
                peers.append(peer)

    # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")
    return peers