1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 from urllib.parse import urlparse
23 from fba.helpers import blacklist
24 from fba.helpers import config
25 from fba.helpers import tidyup
26 from fba.helpers import version
28 from fba.http import network
30 from fba.models import instances
32 from fba.networks import lemmy
33 from fba.networks import misskey
34 from fba.networks import peertube
# "rel" identifiers (no real URLs)
# Each "rel" value of a link found in /.well-known/nodeinfo is tested for
# membership in this list; both https:// and http:// spellings are accepted.
nodeinfo_identifier = [
    "https://nodeinfo.diaspora.software/ns/schema/2.1",
    "https://nodeinfo.diaspora.software/ns/schema/2.0",
    "https://nodeinfo.diaspora.software/ns/schema/1.1",
    "https://nodeinfo.diaspora.software/ns/schema/1.0",
    "http://nodeinfo.diaspora.software/ns/schema/2.1",
    "http://nodeinfo.diaspora.software/ns/schema/2.0",
    "http://nodeinfo.diaspora.software/ns/schema/1.1",
    "http://nodeinfo.diaspora.software/ns/schema/1.0",
]
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Register ``domain`` (if unknown) and every acceptable peer it reports.

    Args:
        domain: Host name of the instance whose peers are fetched.
        origin: Domain passed on to instances.add() as origin, or None.
        software: Software name of ``domain``; when None it is looked up
            with determine_software().
        command: Non-empty command name passed on to instances.add().
        path: Optional path forwarded to determine_software() and
            instances.add().

    Raises:
        ValueError: If a parameter has the wrong type or is empty, or if
            ``domain`` is invalid, a reverse-lookup (.arpa) or fake (.tld) domain.
    """
    # All parameters are validated up-front so that command is also checked
    # when software is None.
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")

    if software is None:
        instances.set_last_instance_fetch(domain)

        try:
            software = determine_software(domain, path)
        except network.exceptions as exception:
            # Best effort: the crawl continues with software=None, but the
            # failure is no longer swallowed silently.
            print(f"WARNING: Exception '{type(exception)}' during determining software type for domain='{domain}'")

    if not instances.is_registered(domain):
        instances.add(domain, origin, command, path, software)

    instances.set_last_instance_fetch(domain)

    peerlist = fetch_peers(domain, software)

    if peerlist is None:
        print("ERROR: Cannot fetch peers:", domain)
        return
    elif instances.has_pending(domain):
        # Flush pending data for this domain
        instances.update_data(domain)

    print(f"INFO: Checking {len(peerlist)} instances from domain='{domain}' ...")
    for instance in peerlist:
        if instance is None:
            # Skip "None" types as tidyup.domain() cannot parse them
            continue

        instance = tidyup.domain(instance)

        if instance == "":
            print(f"WARNING: Empty instance after tidyup.domain(), domain='{domain}'")
            continue
        elif not validators.domain(instance.split("/")[0]):
            print(f"WARNING: Bad instance='{instance}' from domain='{domain}',origin='{origin}'")
            continue
        elif instance.endswith(".arpa"):
            print(f"WARNING: instance='{instance}' is a reversed .arpa domain and should not be used generally.")
            continue
        elif blacklist.is_blacklisted(instance):
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0:
            # Link to a single user profile, not an instance
            continue
        elif instance.endswith(".tld"):
            # Fake domain
            continue
        elif not instances.is_registered(instance):
            instances.add(instance, domain, command)
def fetch_peers(domain: str, software: str) -> list:
    """Fetch the peer list of ``domain``.

    misskey, lemmy and peertube are delegated to their network modules; for
    everything else the JSON API of the instance is queried.

    Args:
        domain: Host name of the instance to query.
        software: Software name of the instance, or None.

    Returns:
        The peers found (possibly empty), or whatever the delegated
        network module returns.

    Raises:
        ValueError: If a parameter has the wrong type, or ``domain`` is
            empty, invalid, a reverse-lookup (.arpa) or fake (.tld) domain.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(software, str) and software is not None:
        raise ValueError(f"software[]='{type(software)}' is not 'str'")

    if software == "misskey":
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        return lemmy.fetch_peers(domain)
    elif software == "peertube":
        return peertube.fetch_peers(domain)

    # Init peers variable
    peers = list()

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_peers,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return peers

    timeout = (config.get("connection_timeout"), config.get("read_timeout"))

    data = network.get_json_api(
        domain,
        "/api/v1/instance/peers",
        headers,
        timeout
    )

    if "error_message" in data:
        # First endpoint failed, try the alternative one.
        # NOTE(review): the endpoint literal was restored as "/api/v3/site"
        # (the branch below expects 'federated_instances') - please confirm.
        data = network.get_json_api(
            domain,
            "/api/v3/site",
            headers,
            timeout
        )

        if "error_message" in data:
            print(f"WARNING: Could not reach any JSON API at domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")
        elif "federated_instances" in data["json"]:
            peers = peers + add_peers(data["json"]["federated_instances"])
        else:
            message = "JSON response does not contain 'federated_instances' or 'error_message'"
            print(f"WARNING: {message},domain='{domain}'")
            instances.set_last_error(domain, message)
    elif isinstance(data["json"], list):
        # The first endpoint answered with a plain list of peers
        peers = data["json"]
    else:
        print(f"WARNING: Cannot parse data[json][]='{type(data['json'])}'")

    instances.set_total_peers(domain, peers)

    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo for ``domain``.

    Auto-discovery through fetch_wellknown_nodeinfo() is tried first. If
    that yields no JSON, a fixed list of static paths is probed.

    Args:
        domain: Host name of the instance to query.
        path: Optional single path (or full http/https URL on ``domain``)
            to probe instead of all static paths.

    Returns:
        The discovered nodeinfo JSON, or the last response dict from
        network.get_json_api(). On a CSRF-check failure a dict with
        'error_message' and 'exception' is returned.

    Raises:
        ValueError: If a parameter has the wrong type, or ``domain`` is
            empty, invalid, a reverse-lookup (.arpa) or fake (.tld) domain.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    nodeinfo = fetch_wellknown_nodeinfo(domain)

    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        return nodeinfo["json"]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    data = dict()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            # NOTE(review): the status_code entry was restored; callers read
            # data['status_code'] next to 'error_message' - confirm the value.
            "status_code"  : 500,
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # NOTE(review): only the two ".json" entries survived in the reviewed
    # text, the remaining entries were restored - please confirm.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0",
        "/api/v1/instance"
    ]

    timeout = (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))

    for request in request_paths:
        # A given path restricts probing to the matching request, either as
        # a bare path or as a full URL on this domain. The comparison is
        # against the request, not against the path itself.
        if path is not None and path not in (request, f"http://{domain}{request}", f"https://{domain}{request}"):
            continue

        data = network.get_json_api(
            domain,
            request,
            headers,
            timeout
        )

        if "error_message" not in data:
            instances.set_detection_mode(domain, "STATIC_CHECK")
            instances.set_nodeinfo_url(domain, request)
            break

        print(f"WARNING: Failed fetching nodeinfo from domain='{domain}',status_code='{data['status_code']}',error_message='{data['error_message']}'")

    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Discover and fetch nodeinfo through /.well-known/nodeinfo.

    Args:
        domain: Host name of the instance to query.

    Returns:
        The response dict of the last request made: the nodeinfo document
        when a link could be fetched, otherwise the /.well-known/nodeinfo
        response. On a CSRF-check failure a dict with 'error_message' and
        'exception' is returned.

    Raises:
        ValueError: If ``domain`` has the wrong type, is empty, invalid,
            a reverse-lookup (.arpa) or fake (.tld) domain.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()

    try:
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        print(f"WARNING: Exception '{type(exception)}' during checking CSRF (fetch_wellknown_nodeinfo,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return {
            # NOTE(review): the status_code entry was restored - confirm the value.
            "status_code"  : 500,
            # A string, same format as fetch_nodeinfo() uses, not a type object
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    if "error_message" in data:
        return data

    nodeinfo = data["json"]
    if "links" not in nodeinfo:
        print("WARNING: nodeinfo does not contain 'links':", domain)
        return data

    for link in nodeinfo["links"]:
        if not isinstance(link, dict) or "rel" not in link:
            print(f"WARNING: link[]='{type(link)}' is not 'dict' or no element 'rel' found")
            continue
        elif link["rel"] not in nodeinfo_identifier:
            print("WARNING: Unknown 'rel' value:", domain, link["rel"])
            continue
        elif not isinstance(link.get("href"), str):
            # Remote data: a link without usable 'href' must not crash the crawl
            print(f"WARNING: link[rel]='{link['rel']}' has no 'href' - SKIPPED!")
            continue

        # Default is that 'href' has a complete URL, but some hosts don't send that
        url = link["href"]
        components = urlparse(url)

        if components.scheme == "" and components.netloc == "":
            # No scheme and host name in it, prepend them from domain
            url = f"https://{domain}{url}"
            components = urlparse(url)

        if blacklist.is_blacklisted(components.netloc):
            print(f"WARNING: components.netloc='{components.netloc}' is blacklisted - SKIPPED!")
            continue
        elif not validators.domain(components.netloc):
            print(f"WARNING: components.netloc='{components.netloc}' is not a valid domain - SKIPPED!")
            continue

        data = network.fetch_api_url(
            url,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        if "error_message" not in data and "json" in data:
            instances.set_detection_mode(domain, "AUTO_DISCOVERY")
            instances.set_nodeinfo_url(domain, link["href"])
            break

        # Record the failure and try the next link
        instances.set_last_error(domain, data)

    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Guess the software of ``domain`` from the HTML served at ``path``.

    The <meta name="generator"> tag is preferred; <meta property="og:site_name">
    is the fallback. Version numbers and phrases such as "powered by" are
    stripped with the helpers from the version module.

    Args:
        domain: Host name of the instance to query.
        path: Non-empty path to fetch, defaults to "/".

    Returns:
        The software name, or None when nothing usable was found.

    Raises:
        ValueError: If a parameter has the wrong type or is empty, or
            ``domain`` is invalid, a reverse-lookup (.arpa) or fake (.tld) domain.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    software = None

    response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))

    # Membership test instead of find() > 0, so a document that starts
    # directly with "<html" (offset 0) is parsed as well.
    if response.ok and response.status_code < 300 and "<html" in response.text:
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})

        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            software = tidyup.domain(generator.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' is generated by '{software}'")
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            software = tidyup.domain(site_name.get("content"))
            if software is not None and software != "":
                print(f"INFO: domain='{domain}' has og:site_name='{software}'")
                instances.set_detection_mode(domain, "SITE_NAME")

    if isinstance(software, str) and software == "":
        # An empty name is reported as "not found"
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = version.remove(software)

    if isinstance(software, str) and "powered by " in software:
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        software = version.strip_until(software, " see ")

    return software
# Maps names of forks/derivatives to the software they are recorded as.
_SOFTWARE_ALIASES = {
    "akkoma"         : "pleroma",
    "rebased"        : "pleroma",
    "akkounfucked"   : "pleroma",
    "ched"           : "pleroma",
    "hometown"       : "mastodon",
    "ecko"           : "mastodon",
    "slipfox calckey": "misskey",
    "calckey"        : "misskey",
    "groundpolis"    : "misskey",
    "foundkey"       : "misskey",
    "cherrypick"     : "misskey",
    "meisskey"       : "misskey",
    "magnetar"       : "misskey",
    "keybump"        : "misskey",
    "runtube.re"     : "peertube",
    "nextcloud social": "nextcloud",
}

def determine_software(domain: str, path: str = None) -> str:
    """Determine the software name running on ``domain``.

    nodeinfo (see fetch_nodeinfo()) is the primary source; when it fails or
    carries no software name, fetch_generator_from_path() is used instead.
    The name is then cleaned, mapped from known fork names to the main
    project and stripped of version numbers.

    Args:
        domain: Host name of the instance to query.
        path: Optional nodeinfo path forwarded to fetch_nodeinfo().

    Returns:
        The software name, or None when it could not be determined.

    Raises:
        ValueError: If a parameter has the wrong type or ``domain`` is empty.
        Exception: Any exception that fetch_nodeinfo() reported under the
            'exception' key is re-raised.
    """
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not 'str'")

    software = None

    data = fetch_nodeinfo(domain, path)

    if "exception" in data:
        # Continue raising it
        raise data["exception"]
    elif "error_message" in data:
        return fetch_generator_from_path(domain)
    elif "status" in data and data["status"] == "error" and "message" in data:
        print("WARNING: JSON response is an error:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "message" in data:
        print("WARNING: JSON response contains only a message:", data["message"])
        instances.set_last_error(domain, data["message"])
        return fetch_generator_from_path(domain)
    elif "software" not in data or "name" not in data["software"]:
        # JSON response does not include [software][name], try the generator
        software = fetch_generator_from_path(domain)
    else:
        software = data["software"]["name"]

    if software is None:
        return None

    # The result was previously assigned to a misspelled name ("sofware")
    # and therefore lost.
    software = tidyup.domain(software)

    if software is None or software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        return None

    if software in _SOFTWARE_ALIASES:
        software = _SOFTWARE_ALIASES[software]
    elif software.find("/") > 0:
        print("WARNING: Spliting of slash:", software)
        software = tidyup.domain(software.split("/")[-1])
    elif software.find("|") > 0:
        print("WARNING: Spliting of pipe:", software)
        software = tidyup.domain(software.split("|")[0])
    elif "powered by" in software:
        software = version.strip_powered_by(software)
    elif " by " in software:
        software = version.strip_until(software, " by ")
    elif " see " in software:
        software = version.strip_until(software, " see ")

    # NOTE(review): the original followed this check with a generator
    # fallback for empty names that could never run; confirm whether a
    # fallback to fetch_generator_from_path() is wanted here instead.
    if software is None or software == "":
        print("WARNING: tidyup.domain() left no software name behind:", domain)
        return None

    if isinstance(software, str) and ("." in software or " " in software):
        # May contain a version number
        software = version.remove(software)

    if isinstance(software, str) and "powered by" in software:
        software = version.remove(version.strip_powered_by(software))

    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract blocked domains and their reasons from an HTML table.

    The first <td> of each row is read as the domain, the second as the reason.

    Args:
        tag: Table element whose <tr> rows are scanned.

    Returns:
        A list of dicts with the keys 'domain' and 'reason'.

    Raises:
        ValueError: If ``tag`` is not a bs4.element.Tag.
        KeyError: If the table contains no rows.
    """
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")

    # Select the rows once instead of once for the check and once for the loop
    rows = tag.select("tr")
    if len(rows) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in rows:
        cells = element.find_all("td")
        if len(cells) == 0:
            # Skipping element, no <td> found (e.g. header row)
            continue

        domain = tidyup.domain(cells[0].text)
        reason = tidyup.reason(cells[1].text)

        if blacklist.is_blacklisted(domain):
            print(f"WARNING: domain='{domain}' is blacklisted - SKIPPED!")
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            # Multiple domains detected in one row.
            # NOTE(review): only the "develop.gab.com" entry survived in the
            # reviewed text; "gab.com" and "gab.ai" were restored from the
            # combined string above - please confirm.
            for gab_domain in ("gab.com", "gab.ai", "develop.gab.com"):
                domains.append({
                    "domain": gab_domain,
                    "reason": reason,
                })
            continue
        elif not validators.domain(domain.split("/")[0]):
            print(f"WARNING: domain='{domain}' is not a valid domain - SKIPPED!")
            continue

        domains.append({
            "domain": domain,
            "reason": reason,
        })

    return domains
def add_peers(rows: dict) -> list:
    """Collect the peers listed under 'linked', 'allowed' and 'blocked'.

    Each peer is passed through tidyup.domain(); blacklisted peers are left out.

    Args:
        rows: Dict that may contain the keys 'linked', 'allowed' and
            'blocked', each holding an iterable of peers or None.

    Returns:
        A list of the accepted peers, in the order they were encountered.
    """
    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        if key not in rows or rows[key] is None:
            # Nothing listed under this key
            continue

        for peer in rows[key]:
            peer = tidyup.domain(peer)

            if blacklist.is_blacklisted(peer):
                continue

            peers.append(peer)

    return peers