# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from urllib.parse import urlparse

import bs4
import validators

from fba import csrf

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
# "rel" identifiers (no real URLs)
# These are compared verbatim against the "rel" member of each entry in a
# /.well-known/nodeinfo "links" list; both the https:// and http:// spellings
# are accepted. Order is kept from newest to oldest schema.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Register `domain` (if unknown) and every new, valid peer it reports.

    Peers are obtained from fetch_peers(). Each one is passed through
    tidyup.domain() and then skipped when it is empty, not a valid domain,
    an .arpa or .tld name, blacklisted, or a link to a single user profile.

    Args:
        domain: Lower-case host name of the instance to crawl.
        origin: Value handed to instances.add() as the origin of `domain`,
            or None.
        software: Software name of the instance. When None it is detected
            with determine_software(); detection failures are reported and
            leave it None.
        command: Non-empty string handed to instances.add() for `domain`
            and for every newly added peer.
        path: Optional nodeinfo path forwarded to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type or is empty, or if
            `domain` is not lower-case, not a valid domain, or ends with
            ".arpa" / ".tld".
    """
    # All parameters are validated up-front. Previously `command` was not
    # checked at all when `software` was None, and the domain checks were
    # repeated a second time where they could never trigger.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    if software is None:
        instances.set_last_instance_fetch(domain)

        try:
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best effort: crawling continues with software=None, but the
            # failure is no longer swallowed silently.
            print(f"WARNING: Exception '{type(exception)}' during determining software type (fetch_instances,{__name__})")
            instances.set_last_error(domain, exception)

    if not instances.is_registered(domain):
        instances.add(domain, origin, command, path, software)

    instances.set_last_instance_fetch(domain)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending(domain):
        # Flush data collected for this domain while fetching
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
            continue
        elif instance.endswith(".arpa"):
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            continue
        elif "/profile/" in instance or "/users/" in instance:
            # Link to a single user profile, not an instance
            continue
        elif instance.endswith(".tld"):
            # Fake domain
            continue
        elif not instances.is_registered(instance):
            instances.add(instance, domain, command)
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the list of peers known to `domain`.

    For "misskey", "lemmy" and "peertube" the matching network module is
    used. Otherwise "/api/v1/instance/peers" is queried and, when that
    fails, "/api/v3/site" is tried and its 'federated_instances' member is
    run through add_peers().

    Args:
        domain: Lower-case host name of the instance.
        software: Software name of the instance, or None.

    Returns:
        The peers found; an empty list when the CSRF check or both API
        requests failed. The count is recorded with
        instances.set_total_peers() unless the CSRF check failed.

    Raises:
        ValueError: If a parameter has the wrong type, or `domain` is
            empty, not lower-case, not a valid domain, or ends with
            ".arpa" / ".tld".
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return peers

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeout
    )

    if "error_message" in data:
        # Was not able to fetch peers, trying alternative endpoint
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeout
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        else:
            # The JSON body may be null or a list, and 'federated_instances'
            # itself may be null; both previously raised TypeError.
            site = data["json"]
            federated = site.get("federated_instances") if isinstance(site, dict) else None

            if isinstance(federated, dict):
                peers.extend(add_peers(federated))
            else:
                message = "JSON response does not contain 'federated_instances' or 'error_message'"
                print(f"WARNING: {message},domain='{domain}'")
                instances.set_last_error(domain, message)
    elif isinstance(data["json"], list):
        # Querying API was successful, the response body is the peer list
        peers = data["json"]
    else:
        print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'")

    instances.set_total_peers(domain, peers)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for `domain`, via auto-discovery or well-known paths.

    fetch_wellknown_nodeinfo() is tried first. When it yields no usable
    JSON, a fixed list of static request paths is queried in order until one
    answers without 'error_message'; that path is then stored with
    instances.set_nodeinfo_url() and the detection mode set to
    "STATIC_CHECK".

    Args:
        domain: Lower-case host name of the instance.
        path: Optional restriction to a single static request path. It may
            be the bare path (e.g. "/nodeinfo/2.0") or that path prefixed
            with "http://<domain>" / "https://<domain>". None tries all.

    Returns:
        - the discovered nodeinfo JSON when auto-discovery succeeded;
        - otherwise the result of the last static request, or an empty
          dict when `path` matched none of them;
        - a dict with 'status_code', 'error_message' and 'exception' when
          the CSRF check raised.
        NOTE(review): the static branch returns the network.get_json_api()
        result as-is, which elsewhere in this module carries its payload
        under 'json', while the discovery branch returns the bare JSON -
        confirm callers cope with both shapes.

    Raises:
        ValueError: If a parameter has the wrong type, or `domain` is
            empty, not lower-case, not a valid domain, or ends with
            ".arpa" / ".tld".
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance",
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    data = dict()
    for request in request_paths:
        # A given path must name this request, either bare or as a full URL
        # on this domain. The old test compared path with
        # f"http://{domain}{path}", which can never be equal, so full URLs
        # were silently ignored.
        if path is not None and path not in (request, f"http://{domain}{request}", f"https://{domain}{request}"):
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            timeout
        )

        if "error_message" not in data:
            instances.set_detection_mode(domain, "STATIC_CHECK")
            instances.set_nodeinfo_url(domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through "/.well-known/nodeinfo".

    The discovery document's 'links' are scanned for a 'rel' listed in
    nodeinfo_identifier. The first such link whose target can be fetched
    without error wins: the detection mode is set to "AUTO_DISCOVERY" and
    the link's 'href' is stored with instances.set_nodeinfo_url().

    Args:
        domain: Lower-case host name of the instance.

    Returns:
        The result of the last request that was made: the fetched nodeinfo
        response on success, the last failed link response, or the
        discovery response itself when no link was fetched. When the CSRF
        check raised, a dict with 'status_code', 'error_message' and
        'exception'.
        NOTE(review): when no link is usable the discovery response (with
        its 'links' JSON) is returned unchanged, so callers cannot tell it
        from a real nodeinfo response - confirm this is intended.

    Raises:
        ValueError: If `domain` is not a str, is empty, not lower-case, not
            a valid domain, or ends with ".arpa" / ".tld".
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            "status_code"  : 500,
            # A string, in the same format fetch_nodeinfo() uses (this used
            # to be the exception's type object).
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" in data:
        return data

    nodeinfo = data["json"]
    if not isinstance(nodeinfo, dict) or "links" not in nodeinfo:
        print("WARNING: nodeinfo does not contain 'links':", domain)
        return data

    for link in nodeinfo["links"]:
        if not isinstance(link, dict) or "rel" not in link:
            print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found")
            continue
        elif link["rel"] not in nodeinfo_identifier:
            print("WARNING: Unknown 'rel' value:", domain, link["rel"])
            continue
        elif not isinstance(link.get("href"), str):
            print(f"WARNING: link[rel]='{link['rel']}' has no usable 'href' - SKIPPED!")
            continue

        # Default is that 'href' has a complete URL, but some hosts don't send that
        url = link["href"]
        components = urlparse(url)

        if components.scheme == "" and components.netloc == "":
            # No scheme and host name in 'href', prepend them from domain
            url = f"https://{domain}{url}"
            components = urlparse(url)

        # These guards apply to the host the link points at. The .arpa/.tld
        # tests used to look at `domain`, which was already validated above.
        if not validators.domain(components.netloc):
            print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
            continue
        elif components.netloc.endswith(".arpa"):
            print(f"WARNING: components.netloc='{components.netloc}' is a domain for reversed IP addresses - SKIPPED!")
            continue
        elif components.netloc.endswith(".tld"):
            print(f"WARNING: components.netloc='{components.netloc}' is a fake domain - SKIPPED!")
            continue
        elif blacklist.is_blacklisted(components.netloc):
            continue

        data = network.fetch_api_url(
            url,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" not in data and "json" in data:
            instances.set_detection_mode(domain, "AUTO_DISCOVERY")
            instances.set_nodeinfo_url(domain, link["href"])
            break

        instances.set_last_error(domain, data)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of `domain` from meta tags of an HTML page.

    The page at `path` is fetched and parsed. <meta name="generator"> is
    preferred, <meta property="og:site_name"> is the fallback. The content
    is passed through tidyup.domain() and then the version helpers, which
    remove version numbers and "powered by" / "hosted on" / " by " /
    " see " phrases.

    Args:
        domain: Lower-case host name of the instance.
        path: Non-empty request path of the page to inspect.

    Returns:
        The detected software name, or None when the page could not be
        used or carried no usable meta tag.

    Raises:
        ValueError: If a parameter has the wrong type or is empty, or
            `domain` is not lower-case, not a valid domain, or ends with
            ".arpa" / ".tld".
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # `in` instead of find() > 0: a document that starts directly with
    # "<html" (no doctype) has the tag at index 0 and used to be ignored.
    if response.ok and response.status_code < 300 and "<html" in response.text:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            software = tidyup.domain(generator.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            software = tidyup.domain(site_name.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set_detection_mode(domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # Corrected empty string to None
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number, remove it
        software = version.remove(software)

    if isinstance(software, str) and "powered by " in software:
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which software `domain` runs.

    The name is taken from data['software']['name'] of fetch_nodeinfo();
    when nodeinfo is unavailable, reports an error/message, or lacks that
    field, fetch_generator_from_path() is used instead. Known forks are
    mapped to their base software (e.g. "akkoma" -> "pleroma"), and
    slash/pipe separated names, "powered by" / " by " / " see " phrases and
    version numbers are stripped.

    Args:
        domain: Lower-case host name of the instance.
        path: Optional nodeinfo path forwarded to fetch_nodeinfo().

    Returns:
        The software name, or None when nothing could be detected.

    Raises:
        ValueError: If a parameter has the wrong type, or `domain` is
            empty, not lower-case, not a valid domain, or ends with
            ".arpa" / ".tld".
        Exception: Whatever fetch_nodeinfo() stored under 'exception' is
            re-raised unchanged.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    software = None

    data = fetch_nodeinfo(domain, path)

    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        return fetch_generator_from_path(domain)

    # NOTE(review): fetch_nodeinfo() appears to return either the bare
    # nodeinfo document or a response wrapper carrying it under 'json'.
    # Unwrap the latter so both shapes are handled - confirm against
    # fetch_nodeinfo().
    if "software" not in data and isinstance(data.get("json"), dict):
        data = data["json"]

    if "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # JSON response does not include [software][name], fetching / instead
        software = fetch_generator_from_path(domain)
    else:
        software = data["software"]["name"]

    if software is None:
        return None

    # The result used to be assigned to a misspelled name ("sofware"), so
    # the tidied value was thrown away.
    software = tidyup.domain(software)

    if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
        software = "pleroma"
    elif software in ["hometown", "ecko"]:
        software = "mastodon"
    elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
        software = "misskey"
    elif software == "runtube.re":
        software = "peertube"
    elif software == "nextcloud social":
        software = "nextcloud"
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = version.strip_powered_by(software)
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    # An empty name becomes None. The old follow-up test
    # `"." in software` then ran against None and raised TypeError; the
    # isinstance() guards below prevent that.
    if isinstance(software, str) and software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number, remove it
        software = version.remove(software)

    if isinstance(software, str) and "powered by" in software:
        software = version.remove(version.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and reasons from the rows of an HTML table.

    For every <tr> below `tag` the first <td> is read as the domain and the
    second as the reason; they are passed through tidyup.domain() and
    tidyup.reason(). Rows without <td> cells (e.g. header rows) are
    skipped, as are invalid, .arpa, .tld and blacklisted domains.

    Args:
        tag: Element whose <tr> descendants are scanned.

    Returns:
        A list of dicts with the keys 'domain' and 'reason'.

    Raises:
        ValueError: If `tag` is not a bs4.element.Tag.
        KeyError: If `tag` contains no <tr> at all.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select once instead of once for the check and once for the loop
    table_rows = tag.select("tr")
    if len(table_rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in table_rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # No <td> found, e.g. a header row
            continue
        elif len(cells) < 2:
            # Reading cells[1] used to raise IndexError here
            print(f"WARNING: Row has only {len(cells)} <td> element(s), no reason column found - SKIPPED!")
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        if not validators.domain(domain.split("/")[0]):
            print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
            continue
        elif domain.endswith(".arpa"):
            print(f"WARNING: domain='{domain}' is a domain for reversed IP addresses - SKIPPED!")
            continue
        elif domain.endswith(".tld"):
            print(f"WARNING: domain='{domain}' is a fake domain - SKIPPED!")
            continue
        elif blacklist.is_blacklisted(domain):
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains in one row, add each one with the same reason
            for gab_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def add_peers(rows: dict) -> list:
    """Collect valid peer domains from a 'federated_instances' structure.

    The keys "linked", "allowed" and "blocked" are read in that order; a
    missing or None key is reported and skipped. Each entry may be a str or
    a dict with a 'domain' member. Entries are passed through
    tidyup.domain() and dropped when they are not a valid domain, end with
    ".arpa" / ".tld", or are blacklisted.

    Args:
        rows: Mapping with the keys named above, or None.

    Returns:
        The accepted peer domains; an empty list when `rows` is None.

    Raises:
        ValueError: If `rows` is neither None nor a dict, or an entry is
            neither a str nor a dict with 'domain'.
    """
    peers = list()

    if rows is None:
        # `key not in None` used to raise TypeError; treat as "no peers"
        print("WARNING: rows is NoneType, no peers to add - SKIPPED!")
        return peers
    elif not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not 'dict'")

    for key in ["linked", "allowed", "blocked"]:
        if key not in rows or rows[key] is None:
            print(f"WARNING: Cannot find key='{key}' or it is NoneType - SKIPPED!")
            continue

        for peer in rows[key]:
            if isinstance(peer, dict) and "domain" in peer:
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            if not validators.domain(peer):
                print(f"WARNING: peer='{peer}' is not a valid domain - SKIPPED!")
                continue
            elif peer.endswith(".arpa"):
                print(f"WARNING: peer='{peer}' is a domain for reversed IP addresses -SKIPPED!")
                continue
            elif peer.endswith(".tld"):
                print(f"WARNING: peer='{peer}' is a fake domain - SKIPPED!")
                continue
            elif blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers