1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
from urllib.parse import urlparse

import bs4
import requests
import validators

from fba import csrf
from fba import utils

from fba.helpers import config
from fba.helpers import cookies
from fba.helpers import domain as domain_helper
from fba.helpers import software as software_helper
from fba.helpers import tidyup
from fba.helpers import version

from fba.http import network

from fba.models import instances

from fba.networks import lemmy
from fba.networks import misskey
from fba.networks import peertube
44 logging.basicConfig(level=logging.INFO)
45 logger = logging.getLogger(__name__)
def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
    """Recursively crawl `domain` for peer instances and register them.

    Determines the instance's software when not provided, registers the
    domain if unknown, fetches its peer list and walks every peer: empty,
    unwanted or profile/tag-link entries are skipped; unregistered peers are
    either crawled recursively (while _DEPTH and the peer-list size stay
    within the configured limits) or merely registered.

    NOTE(review): this copy of the file was missing several structural lines
    (try/else/continue/return and the _DEPTH bookkeeping); they were
    reconstructed from the visible control flow — confirm against upstream.

    :param domain:   Instance to crawl (already validated by raise_on()).
    :param origin:   Domain this instance was discovered from, or None.
    :param software: Known software name, or None to auto-detect.
    :param command:  Name of the invoking command (stored with the record).
    :param path:     Optional nodeinfo path hint forwarded to detection.
    """
    global _DEPTH
    logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)
    _DEPTH = _DEPTH + 1
    domain_helper.raise_on(domain)

    if not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif not isinstance(command, str):
        raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
    elif command == "":
        raise ValueError("Parameter 'command' is empty")
    elif software is None:
        try:
            logger.debug("Software for domain='%s' is not set, determining ...", domain)
            software = determine_software(domain, path)
        except network.exceptions as exception:
            logger.warning("Exception '%s' during determining software type", type(exception))
            instances.set_last_error(domain, exception)

        logger.debug("Determined software='%s' for domain='%s'", software, domain)
    elif not isinstance(software, str):
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Checking if domain='%s' is registered ...", domain)
    if not instances.is_registered(domain):
        logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
        instances.add(domain, origin, command, path, software)

    logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
    instances.set_last_instance_fetch(domain)

    peerlist = list()
    try:
        logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
        peerlist = fetch_peers(domain, software, origin)
    except network.exceptions as exception:
        logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))

    logger.debug("peerlist[]='%s'", type(peerlist))
    if isinstance(peerlist, list):
        logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
        instances.set_total_peers(domain, peerlist)

    logger.debug("peerlist[]='%s'", type(peerlist))
    if peerlist is None or len(peerlist) == 0:
        logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)

        # Flush any pending record changes before bailing out.
        if instances.has_pending(domain):
            logger.debug("Flushing updates for domain='%s' ...", domain)
            instances.update_data(domain)

        logger.debug("Invoking cookies.clear(%s) ...", domain)
        cookies.clear(domain)

        _DEPTH = _DEPTH - 1
        logger.debug("EXIT!")
        return

    logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
    for instance in peerlist:
        logger.debug("instance='%s'", instance)
        if instance is None or instance == "":
            logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
            continue

        logger.debug("instance='%s' - BEFORE!", instance)
        instance = tidyup.domain(instance)
        logger.debug("instance='%s' - AFTER!", instance)

        if instance == "":
            logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
            continue

        # Normalize internationalized domain names to their IDNA form.
        logger.debug("instance='%s' - BEFORE!", instance)
        instance = instance.encode("idna").decode("utf-8")
        logger.debug("instance='%s' - AFTER!", instance)

        if not utils.is_domain_wanted(instance):
            logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
            continue
        elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
            logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
            continue
        elif instance.find("/tag/") > 0:
            logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
            continue
        elif not instances.is_registered(instance):
            logger.debug("Checking if domain='%s' has pending updates ...", domain)
            if instances.has_pending(domain):
                logger.debug("Flushing updates for domain='%s' ...", domain)
                instances.update_data(domain)

            logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
            if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
                logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
                fetch_instances(instance, domain, None, command, path)
            else:
                logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
                instances.add(instance, domain, command)

    logger.debug("Invoking cookies.clear(%s) ...", domain)
    cookies.clear(domain)

    logger.debug("Checking if domain='%s' has pending updates ...", domain)
    if instances.has_pending(domain):
        logger.debug("Flushing updates for domain='%s' ...", domain)
        instances.update_data(domain)

    _DEPTH = _DEPTH - 1
    logger.debug("EXIT!")
def fetch_peers(domain: str, software: str, origin: str) -> list:
    """Fetch the peer list of `domain`.

    Known software ("misskey", "lemmy", "peertube") is delegated to its
    dedicated network module; everything else is queried through the
    generic Mastodon-style API paths below. Returns a (possibly empty)
    list of peer domain names.

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (try/return/break, list initialisations, second API path) were
    restored from the visible control flow; confirm against upstream.

    :param domain:   Instance to query (already validated by raise_on()).
    :param software: Detected software name, or None.
    :param origin:   Originating domain, or None.
    :return: List of peer domains.
    """
    logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
    domain_helper.raise_on(domain)

    if not isinstance(software, str) and software is not None:
        raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
    elif isinstance(origin, str) and origin == "":
        raise ValueError("Parameter 'origin' is empty")

    if software == "misskey":
        logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
        return misskey.fetch_peers(domain)
    elif software == "lemmy":
        logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
        return lemmy.fetch_peers(domain, origin)
    elif software == "peertube":
        logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
        return peertube.fetch_peers(domain)

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s)", type(exception), __name__)
        instances.set_last_error(domain, exception)

        logger.debug("Returning empty list ... - EXIT!")
        return list()

    # API paths tried in order; the first successful one wins.
    # NOTE(review): the second entry was stripped from this copy — confirm.
    paths = [
        "/api/v1/instance/peers",
        "/api/v3/site",
    ]

    # Init peers variable
    peers = list()

    logger.debug("Checking %d paths ...", len(paths))
    for path in paths:
        logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
        data = network.get_json_api(
            domain,
            path,
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
            instances.set_last_error(domain, data)
        elif "json" in data and len(data["json"]) > 0:
            logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
            peers = data["json"]

            logger.debug("Marking domain='%s' as successfully handled ...", domain)
            instances.set_success(domain)
            break

    if not isinstance(peers, list):
        logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
        peers = list()

    logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
    instances.set_total_peers(domain, peers)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers
def fetch_nodeinfo(domain: str, path: str = None) -> dict:
    """Fetch nodeinfo JSON for `domain`.

    First tries .well-known auto-discovery via fetch_wellknown_nodeinfo();
    if that yields no JSON, probes the static request paths below
    (detection_mode=STATIC_CHECK). Returns the network-layer response dict
    (contains "json" on success or "error_message"/"exception" on failure).

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (try/return/break/else, list and dict closures) were restored from the
    visible control flow; confirm against upstream.

    :param domain: Instance to query (already validated by raise_on()).
    :param path:   Optional known nodeinfo path or full URL.
    :return: Response dict from the network layer.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    nodeinfo = fetch_wellknown_nodeinfo(domain)

    logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
    if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
        logger.debug("Invoking instances.set_last_nodeinfo(%s) ...", domain)
        instances.set_last_nodeinfo(domain)

        logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
        return nodeinfo

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        return {
            "error_message": f"exception[{type(exception)}]='{str(exception)}'",
            "exception"    : exception,
        }

    # Well-known static nodeinfo paths, newest schema first.
    # NOTE(review): entries at stripped lines reconstructed — confirm.
    request_paths = [
        "/nodeinfo/2.1.json",
        "/nodeinfo/2.1",
        "/nodeinfo/2.0.json",
        "/nodeinfo/2.0",
        "/nodeinfo/1.0.json",
        "/nodeinfo/1.0",
    ]

    data = dict()
    for request in request_paths:
        logger.debug("request='%s'", request)
        http_url = f"http://{domain}{path}"
        https_url = f"https://{domain}{path}"

        logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
        if path is None or path in [request, http_url, https_url]:
            logger.debug("path='%s',http_url='%s',https_url='%s'", path, http_url, https_url)
            if path in [http_url, https_url]:
                # Caller handed us a full URL; reduce it to its path part.
                logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
                components = urlparse(path)
                path = components.path

            logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
            data = network.get_json_api(
                domain,
                request,
                headers,
                (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
            )

            logger.debug("data[]='%s'", type(data))
            if "error_message" not in data and "json" in data:
                logger.debug("Success: request='%s' - Setting detection_mode=STATIC_CHECK ...", request)
                instances.set_last_nodeinfo(domain)
                instances.set_detection_mode(domain, "STATIC_CHECK")
                instances.set_nodeinfo_url(domain, request)
                break
            else:
                logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])

    logger.debug("data()=%d - EXIT!", len(data))
    return data
def fetch_wellknown_nodeinfo(domain: str) -> dict:
    """Auto-discover nodeinfo for `domain` via /.well-known/nodeinfo.

    Fetches the well-known document, then walks the schema identifiers
    (newest first) looking for a matching "links" entry, repairs relative
    or host-less "href" values, and fetches the referenced nodeinfo URL
    (detection_mode=AUTO_DISCOVERY). Returns the last network response
    dict ("json" on success, "error_message"/"exception" on failure).

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (try/return/continue/break/else, `url = link["href"]`, dict closures)
    were restored from the visible control flow; confirm against upstream.

    :param domain: Instance to query (already validated by raise_on()).
    :return: Response dict from the network layer.
    """
    logger.debug("domain='%s' - CALLED!", domain)
    domain_helper.raise_on(domain)

    # "rel" identifiers (no real URLs)
    nodeinfo_identifier = [
        "https://nodeinfo.diaspora.software/ns/schema/2.1",
        "http://nodeinfo.diaspora.software/ns/schema/2.1",
        "https://nodeinfo.diaspora.software/ns/schema/2.0",
        "http://nodeinfo.diaspora.software/ns/schema/2.0",
        "https://nodeinfo.diaspora.software/ns/schema/1.1",
        "http://nodeinfo.diaspora.software/ns/schema/1.1",
        "https://nodeinfo.diaspora.software/ns/schema/1.0",
        "http://nodeinfo.diaspora.software/ns/schema/1.0",
    ]

    # No CSRF by default, you don't have to add network.api_headers by yourself here
    headers = tuple()
    try:
        logger.debug("Checking CSRF for domain='%s'", domain)
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
        instances.set_last_error(domain, exception)
        return {
            "error_message": type(exception),
            "exception"    : exception,
        }

    logger.debug("Fetching .well-known info for domain='%s'", domain)
    data = network.get_json_api(
        domain,
        "/.well-known/nodeinfo",
        headers,
        (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
    )

    logger.debug("data[]='%s'", type(data))
    if "error_message" not in data:
        nodeinfo = data["json"]

        logger.debug("Marking domain='%s' as successfully handled ...", domain)
        instances.set_success(domain)

        logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
        if "links" in nodeinfo:
            logger.debug("Found nodeinfo[links]()=%d record(s),", len(nodeinfo["links"]))
            for niid in nodeinfo_identifier:
                data = dict()

                logger.debug("Checking niid='%s' ...", niid)
                for link in nodeinfo["links"]:
                    logger.debug("link[%s]='%s'", type(link), link)
                    if not isinstance(link, dict) or not "rel" in link:
                        logger.debug("link[]='%s' is not of type 'dict' or no element 'rel' found - SKIPPED!", type(link))
                        continue
                    elif link["rel"] != niid:
                        logger.debug("link[re]='%s' does not matched niid='%s' - SKIPPED!", link["rel"], niid)
                        continue
                    elif "href" not in link:
                        logger.warning("link[rel]='%s' has no element 'href' - SKIPPED!", link["rel"])
                        continue
                    elif link["href"] is None:
                        logger.debug("link[href] is None, link[rel]='%s' - SKIPPED!", link["rel"])
                        continue

                    # Default is that 'href' has a complete URL, but some hosts don't send that
                    logger.debug("link[rel]='%s' matches niid='%s'", link["rel"], niid)
                    url = link["href"]
                    components = urlparse(url)

                    logger.debug("components[%s]='%s'", type(components), components)
                    if components.scheme == "" and components.netloc == "":
                        logger.warning("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
                        url = f"https://{domain}{url}"
                        components = urlparse(url)
                    elif components.netloc == "":
                        logger.warning("link[href]='%s' has no netloc set, setting domain='%s'", link["href"], domain)
                        url = f"{components.scheme}://{domain}{components.path}"
                        components = urlparse(url)

                    logger.debug("components.netloc[]='%s'", type(components.netloc))
                    if not utils.is_domain_wanted(components.netloc):
                        logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
                        continue

                    logger.debug("Fetching nodeinfo from url='%s' ...", url)
                    data = network.fetch_api_url(
                        url,
                        (config.get("connection_timeout"), config.get("read_timeout"))
                    )

                    logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
                    if "error_message" not in data and "json" in data:
                        logger.debug("Found JSON data()=%d,link[href]='%s' - Setting detection_mode=AUTO_DISCOVERY ...", len(data), link["href"])
                        instances.set_detection_mode(domain, "AUTO_DISCOVERY")
                        instances.set_nodeinfo_url(domain, link["href"])

                        logger.debug("Marking domain='%s' as successfully handled ...", domain)
                        instances.set_success(domain)
                        break
                    else:
                        logger.debug("Setting last error for domain='%s',data[]='%s'", domain, type(data))
                        instances.set_last_error(domain, data)

                logger.debug("data()=%d", len(data))
                if "error_message" not in data and "json" in data:
                    logger.debug("Auto-discovery successful: domain='%s'", domain)
                    break
        else:
            logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)

    logger.debug("Returning data[]='%s' - EXIT!", type(data))
    return data
def fetch_generator_from_path(domain: str, path: str = "/") -> str:
    """Determine software by scraping meta tags from an HTML page.

    Fetches `path` from `domain` and inspects the <meta name="generator">,
    og:site_name and og:platform tags (in that priority). On a redirect to
    a foreign domain the target is registered and TooManyRedirects raised.
    The extracted value is cleaned of version numbers and marketing
    suffixes before being returned (None when nothing usable was found).

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (guards, call-argument lines, `software = None`, return) were restored
    from the visible control flow; the fetch_response() argument list in
    particular should be confirmed against upstream.

    :param domain: Instance to scrape (already validated by raise_on()).
    :param path:   Page to fetch, defaults to "/".
    :return: Detected software name or None.
    :raises requests.exceptions.TooManyRedirects: On redirect to another domain.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str):
        raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
    elif path == "":
        raise ValueError("Parameter 'path' is empty")

    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    software = None

    logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
    response = network.fetch_response(
        domain,
        path,
        network.web_headers,
        (config.get("connection_timeout"), config.get("read_timeout")),
        allow_redirects=True
    )

    logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
    if response.ok and response.status_code < 300 and response.text.find("<html") > 0 and domain_helper.is_in_url(domain, response.url):
        logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
        doc = bs4.BeautifulSoup(response.text, "html.parser")

        logger.debug("doc[]='%s'", type(doc))
        generator = doc.find("meta", {"name"    : "generator"})
        site_name = doc.find("meta", {"property": "og:site_name"})
        platform  = doc.find("meta", {"property": "og:platform"})

        logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s'", type(generator), type(site_name), type(platform))
        if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
            logger.debug("Found generator meta tag: domain='%s'", domain)
            software = tidyup.domain(generator.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
                instances.set_detection_mode(domain, "GENERATOR")
        elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
            logger.debug("Found property=og:site_name, domain='%s'", domain)
            software = tidyup.domain(site_name.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
                instances.set_detection_mode(domain, "SITE_NAME")
        elif isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
            logger.debug("Found property=og:platform, domain='%s'", domain)
            software = tidyup.domain(platform.get("content"))

            logger.debug("software[%s]='%s'", type(software), software)
            if software is not None and software != "":
                logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
                instances.set_detection_mode(domain, "PLATFORM")
    elif not domain_helper.is_in_url(domain, response.url):
        logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)

        components = urlparse(response.url)

        logger.debug("components[]='%s'", type(components))
        if not instances.is_registered(components.netloc):
            logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
            fetch_instances(components.netloc, domain, None, "fetch_generator")

        message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
        instances.set_last_error(domain, message)
        instances.set_software(domain, None)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        raise requests.exceptions.TooManyRedirects(message)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and software == "":
        logger.debug("Corrected empty string to None for software of domain='%s'", domain)
        software = None
    elif isinstance(software, str) and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by " in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))
    elif isinstance(software, str) and " hosted on " in software:
        logger.debug("software='%s' has 'hosted on' in it", software)
        software = version.remove(version.strip_hosted_on(software))
    elif isinstance(software, str) and " by " in software:
        logger.debug("software='%s' has ' by ' in it", software)
        software = version.strip_until(software, " by ")
    elif isinstance(software, str) and " see " in software:
        logger.debug("software='%s' has ' see ' in it", software)
        software = version.strip_until(software, " see ")

    logger.debug("software='%s' - EXIT!", software)
    return software
def determine_software(domain: str, path: str = None) -> str:
    """Determine which fediverse software `domain` runs.

    Queries nodeinfo first; on error or an incomplete JSON document it
    falls back to HTML scraping via fetch_generator_from_path(). The raw
    name is then normalised through software_helper.alias() and stripped
    of version numbers / "powered by" noise. Returns None when detection
    failed entirely.

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (elif/else spine, `if software is None:` guard, returns) were restored
    from the visible control flow; confirm against upstream.

    :param domain: Instance to check (already validated by raise_on()).
    :param path:   Optional nodeinfo path hint.
    :return: Normalised software name or None.
    """
    logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
    domain_helper.raise_on(domain)

    if not isinstance(path, str) and path is not None:
        raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")

    logger.debug("Determining software for domain='%s',path='%s'", domain, path)
    software = None

    logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
    data = fetch_nodeinfo(domain, path)

    logger.debug("data[%s]='%s'", type(data), data)
    if "exception" in data:
        # Continue raising it
        logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
        raise data["exception"]
    elif "error_message" in data:
        logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "json" in data:
        logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
        data = data["json"]
    else:
        logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    if "status" in data and data["status"] == "error" and "message" in data:
        logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" in data and "name" in data["software"]:
        logger.debug("Found data[json][software][name] in JSON response")
        software = data["software"]["name"]
        logger.debug("software[%s]='%s' - FOUND!", type(software), software)
    elif "message" in data:
        logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
        instances.set_last_error(domain, data["message"])
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)
    elif "software" not in data or "name" not in data["software"]:
        logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
        instances.set_detection_mode(domain, None)
        instances.set_nodeinfo_url(domain, None)

        logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
        software = fetch_generator_from_path(domain)
        logger.debug("Generator for domain='%s' is: '%s'", domain, software)

    logger.debug("software[%s]='%s'", type(software), software)
    if software is None:
        logger.debug("Returning None - EXIT!")
        return None

    logger.debug("software='%s'- BEFORE!", software)
    software = software_helper.alias(software)
    logger.debug("software['%s']='%s' - AFTER!", type(software), software)

    if str(software) == "":
        logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
        software = fetch_generator_from_path(domain)
    elif len(str(software)) > 0 and ("." in software or " " in software):
        logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
        software = version.remove(software)

    logger.debug("software[]='%s'", type(software))
    if isinstance(software, str) and "powered by" in software:
        logger.debug("software='%s' has 'powered by' in it", software)
        software = version.remove(version.strip_powered_by(software))

    logger.debug("software='%s' - EXIT!", software)
    return software
def find_domains(tag: bs4.element.Tag) -> list:
    """Extract {"domain", "reason"} records from an HTML blocklist table.

    Walks every <tr> of `tag`, reading the first <td> as domain and the
    second as block reason. Unwanted or syntactically invalid domains are
    skipped; one known row packing three gab domains into a single cell is
    expanded into three records.

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (list init, continues, append blocks, return) were restored from the
    visible control flow; confirm against upstream.

    :param tag: Table element to parse.
    :return: List of dicts with "domain" and "reason" keys.
    :raises ValueError: When `tag` is not a bs4 Tag.
    :raises KeyError: When the table has no rows.
    """
    logger.debug("tag[]='%s' - CALLED!", type(tag))
    if not isinstance(tag, bs4.element.Tag):
        raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
    elif len(tag.select("tr")) == 0:
        raise KeyError("No table rows found in table!")

    domains = list()
    for element in tag.select("tr"):
        logger.debug("element[]='%s'", type(element))
        if not element.find("td"):
            logger.debug("Skipping element, no <td> found")
            continue

        domain = tidyup.domain(element.find("td").text)
        reason = tidyup.reason(element.findAll("td")[1].text)

        logger.debug("domain='%s',reason='%s'", domain, reason)

        if not utils.is_domain_wanted(domain):
            logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
            continue
        elif domain == "gab.com/.ai, develop.gab.com":
            logger.debug("Multiple domains detected in one row")
            domains.append({
                "domain": "gab.com",
                "reason": reason,
            })
            domains.append({
                "domain": "gab.ai",
                "reason": reason,
            })
            domains.append({
                "domain": "develop.gab.com",
                "reason": reason,
            })
            continue
        elif not validators.domain(domain.split("/")[0]):
            logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
            continue

        logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
        domains.append({
            "domain": domain,
            "reason": reason,
        })

    logger.debug("domains()=%d - EXIT!", len(domains))
    return domains
def add_peers(rows: dict) -> list:
    """Collect peer domains from the "linked"/"allowed"/"blocked" lists.

    Each entry may be a plain domain string or a dict carrying a "domain"
    key; both are tidied via tidyup.domain(). Empty entries are skipped,
    unsupported types raise, and unwanted domains are filtered out.

    NOTE(review): reconstructed from a corrupted copy — missing lines
    (list init, continues, else, append, return) were restored from the
    visible control flow; confirm against upstream.

    :param rows: Mapping that may contain "linked"/"allowed"/"blocked" lists.
    :return: Flat list of cleaned peer domains.
    :raises ValueError: When `rows` is no dict or a peer has an unsupported type.
    """
    logger.debug("rows[]='%s' - CALLED!", type(rows))
    if not isinstance(rows, dict):
        raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")

    peers = list()
    for key in ["linked", "allowed", "blocked"]:
        logger.debug("Checking key='%s'", key)
        if key not in rows or rows[key] is None:
            logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
            continue

        logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
        for peer in rows[key]:
            logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
            if peer is None or peer == "":
                logger.debug("peer is empty - SKIPPED")
                continue
            elif isinstance(peer, dict) and "domain" in peer:
                logger.debug("peer[domain]='%s'", peer["domain"])
                peer = tidyup.domain(peer["domain"])
            elif isinstance(peer, str):
                logger.debug("peer='%s'", peer)
                peer = tidyup.domain(peer)
            else:
                raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")

            logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
            if not utils.is_domain_wanted(peer):
                logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
                continue

            logger.debug("Appending peer='%s' ...", peer)
            peers.append(peer)

    logger.debug("peers()=%d - EXIT!", len(peers))
    return peers