1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 from urllib.parse import urlparse
21 from fba import blacklist
22 from fba import config
24 from fba import network
26 from fba.helpers import tidyup
27 from fba.helpers import version
29 from fba.models import instances
31 from fba.networks import lemmy
32 from fba.networks import misskey
33 from fba.networks import peertube
35 # "rel" identifiers (no real URLs)
# Schema identifiers accepted as a link's "rel" value when
# fetch_wellknown_nodeinfo() scans the "links" array of a
# "/.well-known/nodeinfo" response. Both the https:// and the http://
# spelling are listed for schema versions 2.1, 2.0, 1.1 and 1.0; they are
# compared as plain strings and never requested themselves.
36 nodeinfo_identifier = [
37 "https://nodeinfo.diaspora.software/ns/schema/2.1",
38 "https://nodeinfo.diaspora.software/ns/schema/2.0",
39 "https://nodeinfo.diaspora.software/ns/schema/1.1",
40 "https://nodeinfo.diaspora.software/ns/schema/1.0",
41 "http://nodeinfo.diaspora.software/ns/schema/2.1",
42 "http://nodeinfo.diaspora.software/ns/schema/2.0",
43 "http://nodeinfo.diaspora.software/ns/schema/1.1",
44 "http://nodeinfo.diaspora.software/ns/schema/1.0",
# fetch_instances(): register `domain` when it is not yet registered, fetch
# its peer list through fetch_peers() and register every acceptable peer that
# is not yet known.
#
# Parameters:
#   domain   - host to crawl; must be a str whose part before the first "/"
#              passes validators.domain() and which does not end in ".arpa"
#              or ".tld"
#   origin   - str or None; handed to instances.add() for `domain`
#   software - str or None; when None it is looked up with
#              determine_software(domain, path)
#   command  - str (an empty value is rejected); handed to instances.add()
#   path     - optional path handed to determine_software() and
#              instances.add()
#
# Raises:
#   ValueError - when one of the parameter checks below fails
47 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
48 # DEBUG: print(f"DEBUG: domain='{domain}',origin='{origin}',software='{software}',path='{path}' - CALLED!")
49 if not isinstance(domain, str):
50 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
52 raise ValueError("Parameter 'domain' is empty")
53 elif not validators.domain(domain.split("/")[0]):
54 raise ValueError(f"domain='{domain}' is not a valid domain")
55 elif domain.endswith(".arpa"):
56 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
57 elif domain.endswith(".tld"):
58 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
59 elif not isinstance(origin, str) and origin is not None:
60 raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
# No software given: stamp the fetch time first, then try to detect the
# software; network.exceptions raised by the detection are caught here.
61 elif software is None:
62 # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
63 instances.set_last_instance_fetch(domain)
65 # DEBUG: print(f"DEBUG: software for domain='{domain}' is not set, determining ...")
68 software = determine_software(domain, path)
69 except network.exceptions as exception:
70 # DEBUG: print(f"DEBUG: Exception '{type(exception)}' during determining software type")
73 # DEBUG: print(f"DEBUG: Determined software='{software}' for domain='{domain}'")
74 elif not isinstance(software, str):
75 raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
76 elif not isinstance(command, str):
77 raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
79 raise ValueError("Parameter 'command' is empty")
# NOTE(review): the next three checks repeat the domain checks made further
# up in this function. If they belong to the same if/elif chain they can
# never fire - confirm against the full file.
80 elif not validators.domain(domain.split("/")[0]):
81 raise ValueError(f"domain='{domain}' is not a valid domain")
82 elif domain.endswith(".arpa"):
83 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
84 elif domain.endswith(".tld"):
85 raise ValueError(f"domain='{domain}' is a fake domain")
87 if not instances.is_registered(domain):
88 # DEBUG: print(f"DEBUG: Adding new domain='{domain}',origin='{origin}',command='{command}',path='{path}',software='{software}'")
89 instances.add(domain, origin, command, path, software)
91 # DEBUG: print(f"DEBUG: Updating last_instance_fetch for domain='{domain}' ...")
92 instances.set_last_instance_fetch(domain)
94 # DEBUG: print("DEBUG: Fetching instances for domain:", domain, software)
95 peerlist = fetch_peers(domain, software)
98 print("ERROR: Cannot fetch peers:", domain)
100 elif instances.has_pending(domain):
101 # DEBUG: print(f"DEBUG: domain='{domain}' has pending nodeinfo data, flushing ...")
102 instances.update_data(domain)
104 print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
# Per-peer filter: every entry is normalised with tidyup.domain() and then
# checked for being empty, failing validators.domain(), ending in ".arpa" or
# ".tld", being blacklisted or pointing at a single user profile. Entries
# that reach the final branch and are not registered yet are added with
# `domain` in the position where `origin` is passed for the crawled domain.
105 for instance in peerlist:
106 # DEBUG: print(f"DEBUG: instance='{instance}'")
108 # Skip "None" types as tidup.domain() cannot parse them
111 # DEBUG: print(f"DEBUG: instance='{instance}' - BEFORE")
112 instance = tidyup.domain(instance)
113 # DEBUG: print(f"DEBUG: instance='{instance}' - AFTER")
116 print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
118 elif not validators.domain(instance.split("/")[0]):
119 print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
121 elif instance.endswith(".arpa"):
122 print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
124 elif blacklist.is_blacklisted(instance):
125 # DEBUG: print("DEBUG: instance is blacklisted:", instance)
# str.find() yields -1 when the marker is absent and 0 when the string starts
# with it, so "> 0" only matches markers that appear after the first
# character.
127 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
128 # DEBUG: print(f"DEBUG: instance='{instance}' is a link to a single user profile - SKIPPED!")
130 elif instance.endswith(".tld"):
131 # DEBUG: print(f"DEBUG: instance='{instance}' is a fake domain - SKIPPED!")
133 elif not instances.is_registered(instance):
134 # DEBUG: print("DEBUG: Adding new instance:", instance, domain)
135 instances.add(instance, domain, command)
137 # DEBUG: print("DEBUG: EXIT!")
# fetch_peers(): collect the peers of `domain`.
#
# For software "misskey", "lemmy" and "peertube" the work is delegated to the
# fetch_peers() function of the matching fba.networks module. Every other
# value (including None) is handled here with network.get_json_api(),
# starting at "/api/v1/instance/peers".
#
# Parameters:
#   domain   - str whose part before the first "/" passes
#              validators.domain() and which does not end in ".arpa" or
#              ".tld"
#   software - str or None
#
# Returns:
#   annotated as list; the return statement itself is not visible here
#
# Raises:
#   ValueError - when one of the parameter checks below fails
139 def fetch_peers(domain: str, software: str) -> list:
140 # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',software='{software}' - CALLED!")
141 if not isinstance(domain, str):
142 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
144 raise ValueError("Parameter 'domain' is empty")
145 elif not validators.domain(domain.split("/")[0]):
146 raise ValueError(f"domain='{domain}' is not a valid domain")
147 elif domain.endswith(".arpa"):
148 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
149 elif domain.endswith(".tld"):
150 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
151 elif not isinstance(software, str) and software is not None:
152 raise ValueError(f"software[]='{type(software)}' is not 'str'")
# Software-specific implementations take over completely.
154 if software == "misskey":
155 # DEBUG: print(f"DEBUG: Invoking misskey.fetch_peers({domain}) ...")
156 return misskey.fetch_peers(domain)
157 elif software == "lemmy":
158 # DEBUG: print(f"DEBUG: Invoking lemmy.fetch_peers({domain}) ...")
159 return lemmy.fetch_peers(domain)
160 elif software == "peertube":
161 # DEBUG: print(f"DEBUG: Invoking peertube.fetch_peers({domain}) ...")
162 return peertube.fetch_peers(domain)
164 # Init peers variable
167 # No CSRF by default, you don't have to add network.api_headers by yourself here
# csrf.determine() is called with an empty dict of headers; a
# network.exceptions failure is reported and stored via
# instances.set_last_error().
171 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
172 headers = csrf.determine(domain, dict())
173 except network.exceptions as exception:
174 print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
175 instances.set_last_error(domain, exception)
178 # DEBUG: print(f"DEBUG: Fetching peers from '{domain}',software='{software}' ...")
179 data = network.get_json_api(
181 "/api/v1/instance/peers",
183 (config.get("connection_timeout"), config.get("read_timeout"))
186 # DEBUG: print(f"DEBUG: data[]='{type(data)}'")
# An "error_message" key marks a failed request and triggers a second
# get_json_api() call (its path argument is not visible here). From that
# response the peers are taken from data["json"]["federated_instances"] via
# add_peers().
187 if "error_message" in data:
188 # DEBUG: print("DEBUG: Was not able to fetch peers, trying alternative ...")
189 data = network.get_json_api(
193 (config.get("connection_timeout"), config.get("read_timeout"))
196 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
197 if "error_message" in data:
198 print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
199 elif "federated_instances" in data["json"]:
200 # DEBUG: print(f"DEBUG: Found federated_instances for domain='{domain}'")
201 peers = peers + add_peers(data["json"]["federated_instances"])
202 # DEBUG: print("DEBUG: Added instance(s) to peers")
204 message = "JSON response does not contain 'federated_instances' or 'error_message'"
205 print(f"WARNING: {message},domain='{domain}'")
206 instances.set_last_error(domain, message)
207 elif isinstance(data["json"], list):
208 # DEBUG print("DEBUG: Querying API was successful:", domain, len(data['json']))
211 print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'")
# The collected peers are handed to instances.set_total_peers() for `domain`.
213 # DEBUG: print(f"DEBUG: Adding '{len(peers)}' for domain='{domain}'")
214 instances.set_total_peers(domain, peers)
216 # DEBUG: print("DEBUG: Returning peers[]:", type(peers))
# fetch_nodeinfo(): obtain the nodeinfo document of `domain`.
#
# Auto-discovery through fetch_wellknown_nodeinfo() is tried first; when that
# result has no "error_message" and a non-empty "json" entry, that JSON is
# returned directly. Otherwise static paths from request_paths (this listing
# shows "/nodeinfo/2.1.json" and "/nodeinfo/2.0.json") are requested with
# network.get_json_api().
#
# Parameters:
#   domain - str whose part before the first "/" passes validators.domain()
#            and which does not end in ".arpa" or ".tld"
#   path   - str or None; when given, only a request path equal to it is
#            fetched
#
# Returns:
#   annotated as dict
#
# Raises:
#   ValueError - when one of the parameter checks below fails
219 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
220 # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
221 if not isinstance(domain, str):
222 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
224 raise ValueError("Parameter 'domain' is empty")
225 elif not validators.domain(domain.split("/")[0]):
226 raise ValueError(f"domain='{domain}' is not a valid domain")
227 elif domain.endswith(".arpa"):
228 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
229 elif domain.endswith(".tld"):
230 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
231 elif not isinstance(path, str) and path is not None:
232 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
234 # DEBUG: print(f"DEBUG: Fetching nodeinfo from domain='{domain}' ...")
235 nodeinfo = fetch_wellknown_nodeinfo(domain)
237 # DEBUG: print(f"DEBUG: nodeinfo[{type(nodeinfo)}]({len(nodeinfo)}='{nodeinfo}'")
238 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
239 # DEBUG: print(f"DEBUG: Found nodeinfo[json]()={len(nodeinfo['json'])} - EXIT!")
240 return nodeinfo["json"]
242 # No CSRF by default, you don't have to add network.api_headers by yourself here
# On a CSRF failure the error is stored via instances.set_last_error() and a
# dict with the keys "error_message" (formatted string) and "exception" (the
# exception object) is built; determine_software() re-raises the "exception"
# entry when it finds one in the result.
247 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
248 headers = csrf.determine(domain, dict())
249 except network.exceptions as exception:
250 print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
251 instances.set_last_error(domain, exception)
254 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
255 "exception" : exception,
259 "/nodeinfo/2.1.json",
261 "/nodeinfo/2.0.json",
267 for request in request_paths:
268 # DEBUG: print(f"DEBUG: path[{type(path)}]='{path}',request='{request}'")
# NOTE(review): `path == f"http://{domain}{path}"` and the https variant
# compare `path` with a string that contains `path` itself behind a non-empty
# prefix, so neither comparison can ever be true. Effectively only
# `path is None or path == request` selects a request, and the urlparse()
# branch below is never entered. The intent was presumably
# f"http://{domain}{request}" - confirm before changing.
269 if path is None or path == request or path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
270 # DEBUG: print(f"DEBUG: Fetching request='{request}' from domain='{domain}' ...")
271 if path == f"http://{domain}{path}" or path == f"https://{domain}{path}":
272 # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' has protocol in path, splitting ...")
273 components = urlparse(path)
274 path = components.path
276 data = network.get_json_api(
280 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
283 # DEBUG: print(f"DEBUG: response.ok={response.ok},response.status_code={response.status_code},data[]='{type(data)}'")
# A response without "error_message" counts as success: the detection mode
# is recorded as "STATIC_CHECK" and `request` becomes the stored nodeinfo URL.
284 if "error_message" not in data:
285 # DEBUG: print("DEBUG: Success:", request)
286 instances.set_detection_mode(domain, "STATIC_CHECK")
287 instances.set_nodeinfo_url(domain, request)
290 print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
292 # DEBUG: print(f"DEBUG: data()={len(data)} - EXIT!")
# fetch_wellknown_nodeinfo(): nodeinfo auto-discovery for `domain`.
#
# Requests "/.well-known/nodeinfo" with network.get_json_api() and walks the
# "links" array of the answer. For a link whose "rel" is listed in
# nodeinfo_identifier the "href" is parsed, checked against the blacklist and
# validators.domain(), and fetched with network.fetch_api_url(). A response
# that has "json" and no "error_message" sets the detection mode to
# "AUTO_DISCOVERY"; otherwise the response is stored via
# instances.set_last_error().
#
# Parameters:
#   domain - str whose part before the first "/" passes validators.domain()
#            and which does not end in ".arpa" or ".tld"
#
# Returns:
#   annotated as dict
#
# Raises:
#   ValueError - when one of the parameter checks below fails
295 def fetch_wellknown_nodeinfo(domain: str) -> dict:
296 # DEBUG: print(f"DEBUG: domain='{domain}' - CALLED!")
297 if not isinstance(domain, str):
298 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
300 raise ValueError("Parameter 'domain' is empty")
301 elif not validators.domain(domain.split("/")[0]):
302 raise ValueError(f"domain='{domain}' is not a valid domain")
303 elif domain.endswith(".arpa"):
304 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
305 elif domain.endswith(".tld"):
306 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
308 # No CSRF by default, you don't have to add network.api_headers by yourself here
312 # DEBUG: print(f"DEBUG: Checking CSRF for domain='{domain}'")
313 headers = csrf.determine(domain, dict())
314 except network.exceptions as exception:
315 print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
316 instances.set_last_error(domain, exception)
# NOTE(review): "error_message" is set to the exception class here, whereas
# fetch_nodeinfo() builds a formatted string for the same key. Callers that
# print or store the value get different types - confirm whether that is
# intended.
319 "error_message": type(exception),
320 "exception" : exception,
323 # DEBUG: print("DEBUG: Fetching .well-known info for domain:", domain)
324 data = network.get_json_api(
326 "/.well-known/nodeinfo",
328 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
331 if "error_message" not in data:
332 nodeinfo = data["json"]
333 # DEBUG: print("DEBUG: Found entries:", len(nodeinfo), domain)
334 if "links" in nodeinfo:
335 # DEBUG: print("DEBUG: Found links in nodeinfo():", len(nodeinfo["links"]))
336 for link in nodeinfo["links"]:
337 # DEBUG: print(f"DEBUG: link[{type(link)}]='{link}'")
338 if not isinstance(link, dict) or not "rel" in link:
339 print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found")
340 elif link["rel"] in nodeinfo_identifier:
341 # Default is that 'href' has a complete URL, but some hosts don't send that
343 components = urlparse(link["href"])
345 # DEBUG: print(f"DEBUG: components[{type(components)}]='{components}'")
# A relative href (no scheme and no host) is completed with
# "https://<domain>" and parsed again so that components.netloc can be
# checked below.
# NOTE(review): `url` is read here, but its assignment is not visible in
# this listing - presumably it is initialised from link["href"]; confirm.
346 if components.scheme == "" and components.netloc == "":
347 # DEBUG: print(f"DEBUG: link[href]='{link['href']}' has no scheme and host name in it, prepending from domain='{domain}'")
348 url = f"https://{domain}{url}"
349 components = urlparse(url)
351 if blacklist.is_blacklisted(components.netloc):
352 print(f"WARNING: components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
354 elif not validators.domain(components.netloc):
355 print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
358 # DEBUG: print("DEBUG: Fetching nodeinfo from:", url)
359 data = network.fetch_api_url(
361 (config.get("connection_timeout"), config.get("read_timeout"))
364 # DEBUG: print("DEBUG: href,data[]:", link["href"], type(data))
# NOTE(review): the stored nodeinfo URL is the raw link["href"], not the
# `url` value that may have been completed with scheme and host above.
365 if "error_message" not in data and "json" in data:
366 # DEBUG: print("DEBUG: Found JSON nodeinfo():", len(data))
367 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
368 instances.set_nodeinfo_url(domain, link["href"])
371 instances.set_last_error(domain, data)
373 print("WARNING: Unknown 'rel' value:", domain, link["rel"])
375 print("WARNING: nodeinfo does not contain 'links':", domain)
377 # DEBUG: print("DEBUG: Returning data[]:", type(data))
# fetch_generator_from_path(): derive a software name from the HTML page at
# `path` on `domain`.
#
# The page is fetched with network.fetch_response() using
# network.web_headers and parsed with bs4 ("html.parser"). The content of
# <meta name="generator"> is preferred; <meta property="og:site_name"> is the
# fallback. The found text is passed through tidyup.domain() and then through
# the version.* helpers to strip version numbers and phrases such as
# "powered by", " hosted on ", " by " and " see ".
#
# Parameters:
#   domain - str whose part before the first "/" passes validators.domain()
#            and which does not end in ".arpa" or ".tld"
#   path   - str (default "/"); an empty value is rejected
#
# Returns:
#   annotated as str; the debug note below mentions correcting an empty
#   string to None, so callers should presumably expect None as well -
#   confirm against the full file
#
# Raises:
#   ValueError - when one of the parameter checks below fails
380 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
381 # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!")
382 if not isinstance(domain, str):
383 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
385 raise ValueError("Parameter 'domain' is empty")
386 elif not validators.domain(domain.split("/")[0]):
387 raise ValueError(f"domain='{domain}' is not a valid domain")
388 elif domain.endswith(".arpa"):
389 raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
390 elif domain.endswith(".tld"):
391 raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
392 elif not isinstance(path, str):
393 raise ValueError(f"path[]='{type(path)}' is not 'str'")
395 raise ValueError("Parameter 'path' is empty")
397 # DEBUG: print(f"DEBUG: domain='{domain}',path='{path}' - CALLED!")
400 # DEBUG: print(f"DEBUG: Fetching path='{path}' from '{domain}' ...")
401 response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
403 # DEBUG: print("DEBUG: domain,response.ok,response.status_code,response.text[]:", domain, response.ok, response.status_code, type(response.text))
# Only a successful response (status below 300) with a non-empty body is
# parsed.
404 if response.ok and response.status_code < 300 and len(response.text) > 0:
405 # DEBUG: print("DEBUG: Search for <meta name='generator'>:", domain)
406 doc = bs4.BeautifulSoup(response.text, "html.parser")
408 # DEBUG: print("DEBUG: doc[]:", type(doc))
409 generator = doc.find("meta", {"name" : "generator"})
410 site_name = doc.find("meta", {"property": "og:site_name"})
412 # DEBUG: print(f"DEBUG: generator='{generator}',site_name='{site_name}'")
# The generator tag wins over og:site_name; the detection mode is recorded as
# "GENERATOR" or "SITE_NAME" once a non-empty name was extracted.
413 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
414 # DEBUG: print("DEBUG: Found generator meta tag:", domain)
415 software = tidyup.domain(generator.get("content"))
416 # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'")
417 if software is not None and software != "":
418 print(f"INFO: domain='{domain}' is generated by '{software}'")
419 instances.set_detection_mode(domain, "GENERATOR")
420 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
421 # DEBUG: print("DEBUG: Found property=og:site_name:", domain)
422 software = tidyup.domain(site_name.get("content"))
423 # DEBUG: print(f"DEBUG: software[{type(software)}]='{software}'")
424 if software is not None and software != "":
425 print(f"INFO: domain='{domain}' has og:site_name='{software}'")
426 instances.set_detection_mode(domain, "SITE_NAME")
428 # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
429 if isinstance(software, str) and software == "":
430 # DEBUG: print(f"DEBUG: Corrected empty string to None for software of domain='{domain}'")
# A "." or " " in the name is taken as a hint for a trailing version number,
# which version.remove() strips.
432 elif isinstance(software, str) and ("." in software or " " in software):
433 # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
434 software = version.remove(software)
436 # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
# Phrase clean-up: every test requires `software` to be a str, so a
# non-string value passes through unchanged.
437 if isinstance(software, str) and "powered by " in software:
438 # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
439 software = version.remove(version.strip_powered_by(software))
440 elif isinstance(software, str) and " hosted on " in software:
441 # DEBUG: print(f"DEBUG: software='{software}' has 'hosted on' in it")
442 software = version.remove(version.strip_hosted_on(software))
443 elif isinstance(software, str) and " by " in software:
444 # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
445 software = version.strip_until(software, " by ")
446 elif isinstance(software, str) and " see " in software:
447 # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
448 software = version.strip_until(software, " see ")
450 # DEBUG: print(f"DEBUG: software='{software}' - EXIT!")
# determine_software(): work out which software `domain` runs.
#
# The nodeinfo document from fetch_nodeinfo() is the primary source
# (data["software"]["name"]). When nodeinfo reports an error, contains only a
# message, or lacks the software name, fetch_generator_from_path() is used
# instead. The name is then mapped from known fork names to their base
# software and cleaned of path, pipe, "powered by", " by " and " see " parts
# and of version numbers.
#
# Parameters:
#   domain - str; an empty value is rejected
#   path   - str or None; forwarded to fetch_nodeinfo()
#
# Returns:
#   annotated as str
#
# Raises:
#   ValueError - when one of the parameter checks below fails
#   the exception stored under data["exception"] by fetch_nodeinfo(), which
#   is re-raised unchanged
453 def determine_software(domain: str, path: str = None) -> str:
454 # DEBUG: print(f"DEBUG: domain({len(domain)})='{domain}',path='{path}' - CALLED!")
455 if not isinstance(domain, str):
456 raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
458 raise ValueError("Parameter 'domain' is empty")
459 elif not isinstance(path, str) and path is not None:
460 raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")
462 # DEBUG: print("DEBUG: Determining software for domain,path:", domain, path)
465 # DEBUG: print(f"DEBUG: Fetching nodeinfo from '{domain}' ...")
466 data = fetch_nodeinfo(domain, path)
468 # DEBUG: print(f"DEBUG: data[{type(data)}]='{data}'")
469 if "exception" in data:
470 # Continue raising it
471 raise data["exception"]
472 elif "error_message" in data:
473 # DEBUG: print(f"DEBUG: Returned error_message during fetching nodeinfo: '{data['error_message']}',status_code='{data['status_code']}'")
474 return fetch_generator_from_path(domain)
475 elif "status" in data and data["status"] == "error" and "message" in data:
476 print("WARNING: JSON response is an error:", data["message"])
477 instances.set_last_error(domain, data["message"])
478 return fetch_generator_from_path(domain)
479 elif "message" in data:
480 print("WARNING: JSON response contains only a message:", data["message"])
481 instances.set_last_error(domain, data["message"])
482 return fetch_generator_from_path(domain)
483 elif "software" not in data or "name" not in data["software"]:
484 # DEBUG: print(f"DEBUG: JSON response from domain='{domain}' does not include [software][name], fetching / ...")
485 software = fetch_generator_from_path(domain)
486 # DEBUG: print(f"DEBUG: Generator for domain='{domain}' is: '{software}'")
# NOTE(review): this condition is the exact negation of the branch above, so
# within the same chain it is always true when reached.
487 elif "software" in data and "name" in data["software"]:
488 # DEBUG: print("DEBUG: Found data[software][name] in JSON response")
489 software = data["software"]["name"]
492 # DEBUG: print("DEBUG: Returning None - EXIT!")
# NOTE(review): the result of tidyup.domain() is bound to `sofware` (typo),
# so `software` itself is NOT normalised before the comparisons below. This
# looks like a bug; the intended target is presumably `software`.
495 sofware = tidyup.domain(software)
496 # DEBUG: print("DEBUG: sofware after tidyup.domain():", software)
# Fork/alias mapping: known derivative names are collapsed onto their base
# software. The visible assignments are "mastodon", "peertube" and
# "nextcloud"; the assignments for the first and third list are not visible
# in this listing (their debug notes mention pleroma and misskey).
498 if software in ["akkoma", "rebased", "akkounfucked", "ched"]:
499 # DEBUG: print("DEBUG: Setting pleroma:", domain, software)
501 elif software in ["hometown", "ecko"]:
502 # DEBUG: print("DEBUG: Setting mastodon:", domain, software)
503 software = "mastodon"
504 elif software in ["slipfox calckey", "calckey", "groundpolis", "foundkey", "cherrypick", "meisskey", "magnetar", "keybump"]:
505 # DEBUG: print("DEBUG: Setting misskey:", domain, software)
507 elif software == "runtube.re":
508 # DEBUG: print("DEBUG: Setting peertube:", domain, software)
509 software = "peertube"
510 elif software == "nextcloud social":
511 # DEBUG: print("DEBUG: Setting nextcloud:", domain, software)
512 software = "nextcloud"
# str.find() returns 0 for a match at the very start, so "> 0" only splits
# when "/" or "|" appears after the first character. With "/" the last
# segment is kept, with "|" the first one.
513 elif software.find("/") > 0:
514 print("WARNING: Spliting of slash:", software)
515 software = tidyup.domain(software.split("/")[-1])
516 elif software.find("|") > 0:
517 print("WARNING: Spliting of pipe:", software)
518 software = tidyup.domain(software.split("|")[0])
519 elif "powered by" in software:
520 # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
521 software = version.strip_powered_by(software)
522 elif isinstance(software, str) and " by " in software:
523 # DEBUG: print(f"DEBUG: software='{software}' has ' by ' in it")
524 software = version.strip_until(software, " by ")
525 elif isinstance(software, str) and " see " in software:
526 # DEBUG: print(f"DEBUG: software='{software}' has ' see ' in it")
527 software = version.strip_until(software, " see ")
529 # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
531 print("WARNING: tidyup.domain() left no software name behind:", domain)
534 # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
# Last resort: an empty name falls back to the HTML generator lookup; a name
# containing "." or " " is assumed to carry a version number that
# version.remove() strips.
535 if str(software) == "":
536 # DEBUG: print(f"DEBUG: software for '{domain}' was not detected, trying generator ...")
537 software = fetch_generator_from_path(domain)
538 elif len(str(software)) > 0 and ("." in software or " " in software):
539 # DEBUG: print(f"DEBUG: software='{software}' may contain a version number, domain='{domain}', removing it ...")
540 software = version.remove(software)
542 # DEBUG: print(f"DEBUG: software[]='{type(software)}'")
543 if isinstance(software, str) and "powered by" in software:
544 # DEBUG: print(f"DEBUG: software='{software}' has 'powered by' in it")
545 software = version.remove(version.strip_powered_by(software))
547 # DEBUG: print("DEBUG: Returning domain,software:", domain, software)
# find_domains(): extract domain/reason pairs from the rows of an HTML table.
#
# Every <tr> below `tag` that contains a <td> is read: the text of the first
# cell is normalised with tidyup.domain(), the text of the second cell with
# tidyup.reason(). Blacklisted and invalid domains are skipped with a
# warning.
#
# Parameters:
#   tag - bs4.element.Tag that contains the table rows
#
# Returns:
#   annotated as list; the visible "domain" dictionary key suggests a list of
#   dicts - confirm against the full file
#
# Raises:
#   ValueError - when `tag` is not a bs4.element.Tag
#   KeyError   - when `tag` contains no <tr> element
550 def find_domains(tag: bs4.element.Tag) -> list:
551 # DEBUG: print(f"DEBUG: tag[]='{type(tag)}' - CALLED!")
552 if not isinstance(tag, bs4.element.Tag):
553 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
554 elif len(tag.select("tr")) == 0:
555 raise KeyError("No table rows found in table!")
558 for element in tag.select("tr"):
559 # DEBUG: print(f"DEBUG: element[]='{type(element)}'")
560 if not element.find("td"):
561 # DEBUG: print("DEBUG: Skipping element, no <td> found")
# NOTE(review): indexing findAll("td")[1] raises IndexError for a row that
# has only a single <td>; only the presence of a first cell is checked above.
564 domain = tidyup.domain(element.find("td").text)
565 reason = tidyup.reason(element.findAll("td")[1].text)
567 # DEBUG: print(f"DEBUG: domain='{domain}',reason='{reason}'")
569 if blacklist.is_blacklisted(domain):
570 print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!")
# Special case: one known table row lists several domains in a single cell
# and is matched by its exact text.
572 elif domain == "gab.com/.ai, develop.gab.com":
573 # DEBUG: print("DEBUG: Multiple domains detected in one row")
583 "domain": "develop.gab.com",
587 elif not validators.domain(domain.split("/")[0]):
588 print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
591 # DEBUG: print(f"DEBUG: Adding domain='{domain}',reason='{reason}' ...")
597 # DEBUG: print(f"DEBUG: domains()={len(domains)} - EXIT!")
# add_peers(): gather peer names from a "federated_instances"-style mapping.
#
# The keys "linked", "allowed" and "blocked" of `rows` are visited in that
# order; a key that is missing or holds None is ignored. Each entry is
# normalised with tidyup.domain() and checked with
# blacklist.is_blacklisted(), which per the debug note skips blacklisted
# peers.
#
# Parameters:
#   rows - dict that may contain the keys "linked", "allowed" and "blocked",
#          each holding an iterable of peer names or None
#
# Returns:
#   annotated as list
600 def add_peers(rows: dict) -> list:
601 # DEBUG: print(f"DEBUG: rows()={len(rows)} - CALLED!")
603 for key in ["linked", "allowed", "blocked"]:
604 # DEBUG: print(f"DEBUG: Checking key='{key}'")
605 if key in rows and rows[key] is not None:
606 # DEBUG: print(f"DEBUG: Adding {len(rows[key])} peer(s) to peers list ...")
607 for peer in rows[key]:
608 # DEBUG: print(f"DEBUG: peer='{peer}' - BEFORE!")
609 peer = tidyup.domain(peer)
611 # DEBUG: print(f"DEBUG: peer='{peer}' - AFTER!")
612 if blacklist.is_blacklisted(peer):
613 # DEBUG: print(f"DEBUG: peer='{peer}' is blacklisted, skipped!")
616 # DEBUG: print(f"DEBUG: Adding peer='{peer}' ...")
619 # DEBUG: print(f"DEBUG: peers()={len(peers)} - EXIT!")