1 # Copyright (C) 2023 Free Software Foundation
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU Affero General Public License as published
5 # by the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU Affero General Public License for more details.
13 # You should have received a copy of the GNU Affero General Public License
14 # along with this program. If not, see <https://www.gnu.org/licenses/>.
18 from urllib.parse import urlparse
27 from fba.helpers import config
28 from fba.helpers import cookies
29 from fba.helpers import domain as domain_helper
30 from fba.helpers import software as software_helper
31 from fba.helpers import tidyup
32 from fba.helpers import version
34 from fba.http import network
36 from fba.models import instances
38 from fba.networks import lemmy
39 from fba.networks import misskey
40 from fba.networks import peertube
# Module-wide logging setup: INFO level by default, with a per-module logger.
# NOTE(review): this dump carries original file line numbers baked into each
# line and has gaps in the numbering (missing lines), so it is not runnable
# as-is — verify all annotations against the original file.
44 logging.basicConfig(level=logging.INFO)
45 logger = logging.getLogger(__name__)
# Recursively crawl a fediverse instance and its peers.
#
# Validates parameters, determines the software type when not supplied
# (via determine_software), registers the domain if it is new, fetches its
# peer list (fetch_peers), then walks each peer: tidies the name, IDNA-encodes
# it, skips unwanted entries and profile/tag links, registers unknown peers
# and recurses while _DEPTH <= config "max_crawl_depth" and the peer list has
# at least "min_peers_length" entries.  Pending instance updates are flushed
# and per-domain cookies cleared before exit.  Returns nothing (logs "EXIT!").
#
# NOTE(review): several control-flow lines (try:, continue, early return) are
# missing from this dump — see the gaps in the embedded numbering — so the
# exact flow must be confirmed against the original file.
47 def fetch_instances(domain: str, origin: str, software: str, command: str, path: str = None):
49 logger.debug("domain='%s',origin='%s',software='%s',command='%s',path='%s',_DEPTH=%d - CALLED!", domain, origin, software, command, path, _DEPTH)
51 domain_helper.raise_on(domain)
# Parameter validation: origin/path may be None, command must be a non-empty str.
53 if not isinstance(origin, str) and origin is not None:
54 raise ValueError(f"Parameter origin[]='{type(origin)}' is not of type 'str'")
55 elif not isinstance(command, str):
56 raise ValueError(f"Parameter command[]='{type(command)}' is not of type 'str'")
58 raise ValueError("Parameter 'command' is empty")
59 elif software is None:
# Software not supplied by the caller — detect it; network errors are only
# recorded, not propagated (a missing try: line presumably wraps this).
61 logger.debug("Software for domain='%s' is not set, determining ...", domain)
62 software = determine_software(domain, path)
63 except network.exceptions as exception:
64 logger.warning("Exception '%s' during determining software type", type(exception))
65 instances.set_last_error(domain, exception)
67 logger.debug("Determined software='%s' for domain='%s'", software, domain)
68 elif not isinstance(software, str):
69 raise ValueError(f"Parameter software[]='{type(software)}' is not of type 'str'")
70 elif not isinstance(path, str) and path is not None:
71 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
# Register the domain on first sight, then stamp its last fetch time.
73 logger.debug("Checking if domain='%s' is registered ...", domain)
74 if not instances.is_registered(domain):
75 logger.debug("Adding new domain='%s',origin='%s',command='%s',path='%s',software='%s'", domain, origin, command, path, software)
76 instances.add(domain, origin, command, path, software)
78 logger.debug("Updating last_instance_fetch for domain='%s' ...", domain)
79 instances.set_last_instance_fetch(domain)
# Fetch this instance's peer list; failures are logged and recorded only.
83 logger.debug("Fetching instances for domain='%s',software='%s',origin='%s'", domain, software, origin)
84 peerlist = fetch_peers(domain, software, origin)
85 except network.exceptions as exception:
86 logger.warning("Cannot fetch peers from domain='%s',software='%s': '%s'", domain, software, type(exception))
88 logger.debug("peerlist[]='%s'", type(peerlist))
89 if isinstance(peerlist, list):
90 logger.debug("Invoking instances.set_total_peerlist(%s,%d) ...", domain, len(peerlist))
91 instances.set_total_peers(domain, peerlist)
93 logger.debug("peerlist[]='%s'", type(peerlist))
# No peers: flush pending updates, clear cookies and bail out early.
94 if peerlist is None or len(peerlist) == 0:
95 logger.warning("Cannot fetch peers: domain='%s',software='%s'", domain, software)
97 if instances.has_pending(domain):
98 logger.debug("Flushing updates for domain='%s' ...", domain)
99 instances.update_data(domain)
101 logger.debug("Invoking cookies.clear(%s) ...", domain)
102 cookies.clear(domain)
105 logger.debug("EXIT!")
# Main crawl loop over every discovered peer.
108 logger.info("Checking %d instance(s) from domain='%s',software='%s' ...", len(peerlist), domain, software)
109 for instance in peerlist:
110 logger.debug("instance='%s'", instance)
111 if instance is None or instance == "":
112 logger.debug("instance[%s]='%s' is either None or empty - SKIPPED!", type(instance), instance)
# Normalize the peer name, then IDNA-encode it for consistent storage.
115 logger.debug("instance='%s' - BEFORE!", instance)
116 instance = tidyup.domain(instance)
117 logger.debug("instance='%s' - AFTER!", instance)
120 logger.warning("Empty instance after tidyup.domain(), domain='%s'", domain)
123 logger.debug("instance='%s' - BEFORE!", instance)
124 instance = instance.encode("idna").decode("utf-8")
125 logger.debug("instance='%s' - AFTER!", instance)
# Skip unwanted domains and entries that are really profile/community/tag URLs.
127 if not utils.is_domain_wanted(instance):
128 logger.debug("instance='%s' is not wanted - SKIPPED!", instance)
130 elif instance.find("/profile/") > 0 or instance.find("/users/") > 0 or (instances.is_registered(instance.split("/")[0]) and instance.find("/c/") > 0):
131 logger.debug("instance='%s' is a link to a single user profile - SKIPPED!", instance)
133 elif instance.find("/tag/") > 0:
134 logger.debug("instance='%s' is a link to a tag - SKIPPED!", instance)
136 elif not instances.is_registered(instance):
# Unknown peer: flush the current domain's pending data first, then either
# recurse (within depth/size limits) or just register the peer.
137 logger.debug("Checking if domain='%s' has pending updates ...", domain)
138 if instances.has_pending(domain):
139 logger.debug("Flushing updates for domain='%s' ...", domain)
140 instances.update_data(domain)
142 logger.debug("instance='%s',origin='%s',_DEPTH=%d reached!", instance, origin, _DEPTH)
143 if _DEPTH <= config.get("max_crawl_depth") and len(peerlist) >= config.get("min_peers_length"):
144 logger.debug("Fetching instance='%s',origin='%s',command='%s',path='%s',_DEPTH=%d ...", instance, domain, command, path, _DEPTH)
145 fetch_instances(instance, domain, None, command, path)
147 logger.debug("Adding instance='%s',domain='%s',command='%s',_DEPTH=%d ...", instance, domain, command, _DEPTH)
148 instances.add(instance, domain, command)
# Cleanup: clear cookies and flush any remaining pending updates.
150 logger.debug("Invoking cookies.clear(%s) ...", domain)
151 cookies.clear(domain)
153 logger.debug("Checking if domain='%s' has pending updates ...", domain)
154 if instances.has_pending(domain):
155 logger.debug("Flushing updates for domain='%s' ...", domain)
156 instances.update_data(domain)
159 logger.debug("EXIT!")
# Fetch the peer (federation) list of one domain and return it as a list.
#
# Software-specific instances (misskey, lemmy, peertube) are delegated to
# their dedicated network modules; everything else is queried over the
# Mastodon-style API path "/api/v1/instance/peers" after a CSRF header check.
# Success and errors are recorded via the instances model; the total peer
# count is stored with instances.set_total_peers().
#
# NOTE(review): the try: line for the CSRF check, the full paths list, the
# peers initialisation and the return statement are missing from this dump —
# confirm against the original file.
161 def fetch_peers(domain: str, software: str, origin: str) -> list:
162 logger.debug("domain='%s',software='%s',origin='%s' - CALLED!", domain, software, origin)
163 domain_helper.raise_on(domain)
165 if not isinstance(software, str) and software is not None:
166 raise ValueError(f"software[]='{type(software)}' is not of type 'str'")
# Delegate to software-specific peer fetchers where available.
168 if software == "misskey":
169 logger.debug("Invoking misskey.fetch_peers(%s) ...", domain)
170 return misskey.fetch_peers(domain)
171 elif software == "lemmy":
172 logger.debug("Invoking lemmy.fetch_peers(%s,%s) ...", domain, origin)
173 return lemmy.fetch_peers(domain, origin)
174 elif software == "peertube":
175 logger.debug("Invoking peertube.fetch_peers(%s) ...", domain)
176 return peertube.fetch_peers(domain)
178 # No CSRF by default, you don't have to add network.api_headers by yourself here
# Determine any CSRF headers required by the remote instance; on failure the
# error is recorded and (presumably) an empty list is returned.
182 logger.debug("Checking CSRF for domain='%s'", domain)
183 headers = csrf.determine(domain, dict())
184 except network.exceptions as exception:
185 logger.warning("Exception '%s' during checking CSRF (fetch_peers,%s) - EXIT!", type(exception), __name__)
186 instances.set_last_error(domain, exception)
# Candidate API paths to probe (list literal truncated in this dump).
190 "/api/v1/instance/peers",
194 # Init peers variable
# Probe each candidate path until a usable JSON response is found.
197 logger.debug("Checking %d paths ...", len(paths))
199 logger.debug("Fetching path='%s' from domain='%s',software='%s' ...", path, domain, software)
200 data = network.get_json_api(
204 (config.get("connection_timeout"), config.get("read_timeout"))
207 logger.debug("data[]='%s'", type(data))
208 if "error_message" in data:
209 logger.debug("Was not able to fetch peers from path='%s',domain='%s' ...", path, domain)
210 instances.set_last_error(domain, data)
211 elif "json" in data and len(data["json"]) > 0:
212 logger.debug("Querying API path='%s' was successful: domain='%s',data[json][%s]()=%d", path, domain, type(data['json']), len(data['json']))
215 logger.debug("Marking domain='%s' as successfully handled ...", domain)
216 instances.set_success(domain)
# Guard against non-list API payloads before recording the peer total.
219 if not isinstance(peers, list):
220 logger.warning("peers[]='%s' is not of type 'list', maybe bad API response?", type(peers))
223 logger.debug("Invoking instances.set_total_peers(%s,%d) ...", domain, len(peers))
224 instances.set_total_peers(domain, peers)
226 logger.debug("peers()=%d - EXIT!", len(peers))
# Fetch nodeinfo for a domain and return the raw API response dict.
#
# First attempts auto-discovery via fetch_wellknown_nodeinfo(); if that
# yields usable JSON the result is returned early.  Otherwise a list of
# well-known static nodeinfo paths (/nodeinfo/2.1.json down to 1.0.json) is
# probed directly; on success detection_mode is set to "STATIC_CHECK" and the
# nodeinfo URL recorded.  The returned dict may contain "error_message" and
# "exception" keys on failure.
#
# NOTE(review): try: lines, the full request_paths list and return statements
# are missing from this dump — confirm against the original file.
229 def fetch_nodeinfo(domain: str, path: str = None) -> dict:
230 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
231 domain_helper.raise_on(domain)
233 if not isinstance(path, str) and path is not None:
234 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
# First choice: auto-discovery through /.well-known/nodeinfo.
236 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
237 nodeinfo = fetch_wellknown_nodeinfo(domain)
239 logger.debug("nodeinfo[%s](%d='%s'", type(nodeinfo), len(nodeinfo), nodeinfo)
240 if "error_message" not in nodeinfo and "json" in nodeinfo and len(nodeinfo["json"]) > 0:
241 logger.debug("Invoking instances.set_last_nodeinfo(%s) ...", domain)
242 instances.set_last_nodeinfo(domain)
244 logger.debug("Found nodeinfo[json]()=%d - EXIT!", len(nodeinfo['json']))
247 # No CSRF by default, you don't have to add network.api_headers by yourself here
# CSRF check before direct probing; on failure all software/detection data is
# reset and an error dict is (presumably) returned.
252 logger.debug("Checking CSRF for domain='%s'", domain)
253 headers = csrf.determine(domain, dict())
254 except network.exceptions as exception:
255 logger.warning("Exception '%s' during checking CSRF (nodeinfo,%s) - EXIT!", type(exception), __name__)
256 instances.set_last_error(domain, exception)
257 instances.set_software(domain, None)
258 instances.set_detection_mode(domain, None)
259 instances.set_nodeinfo_url(domain, None)
262 "error_message": f"exception[{type(exception)}]='{str(exception)}'",
263 "exception" : exception,
# Static fallback paths, newest schema first (list literal truncated here).
267 "/nodeinfo/2.1.json",
269 "/nodeinfo/2.0.json",
271 "/nodeinfo/1.0.json",
276 for request in request_paths:
277 logger.debug("request='%s'", request)
278 http_url = f"http://{domain}{path}"
279 https_url = f"https://{domain}{path}"
281 logger.debug("path[%s]='%s',request='%s',http_url='%s',https_url='%s'", type(path), path, request, http_url, https_url)
# Only probe when no path was given or the given path matches this request;
# a full URL passed as 'path' is reduced to its path component first.
282 if path is None or path in [request, http_url, https_url]:
283 logger.debug("path='%s',http_url='%s',https_url='%s'", path, http_url, https_url)
284 if path in [http_url, https_url]:
285 logger.debug("domain='%s',path='%s' has protocol in path, splitting ...", domain, path)
286 components = urlparse(path)
287 path = components.path
289 logger.debug("Fetching request='%s' from domain='%s' ...", request, domain)
290 data = network.get_json_api(
294 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
297 logger.debug("data[]='%s'", type(data))
298 if "error_message" not in data and "json" in data:
299 logger.debug("Success: request='%s' - Setting detection_mode=STATIC_CHECK ...", request)
300 instances.set_last_nodeinfo(domain)
301 instances.set_detection_mode(domain, "STATIC_CHECK")
302 instances.set_nodeinfo_url(domain, request)
305 logger.warning("Failed fetching nodeinfo from domain='%s',status_code='%s',error_message='%s'", domain, data['status_code'], data['error_message'])
307 logger.debug("data()=%d - EXIT!", len(data))
# Auto-discover a domain's nodeinfo document via /.well-known/nodeinfo.
#
# Fetches the well-known link document, then scans its "links" array for the
# known diaspora nodeinfo schema "rel" identifiers (2.1 down to 1.0, https
# preferred before http).  Relative or scheme-less hrefs are normalized
# against the domain; the first wanted URL that returns JSON sets
# detection_mode="AUTO_DISCOVERY" and records the nodeinfo URL.  Returns the
# last API response dict (with "error_message"/"exception" keys on failure).
#
# NOTE(review): try: lines, continue statements and the assignment of 'url'
# from link["href"] are missing from this dump — confirm against the original.
310 def fetch_wellknown_nodeinfo(domain: str) -> dict:
311 logger.debug("domain='%s' - CALLED!", domain)
312 domain_helper.raise_on(domain)
314 # "rel" identifiers (no real URLs)
315 nodeinfo_identifier = [
316 "https://nodeinfo.diaspora.software/ns/schema/2.1",
317 "http://nodeinfo.diaspora.software/ns/schema/2.1",
318 "https://nodeinfo.diaspora.software/ns/schema/2.0",
319 "http://nodeinfo.diaspora.software/ns/schema/2.0",
320 "https://nodeinfo.diaspora.software/ns/schema/1.1",
321 "http://nodeinfo.diaspora.software/ns/schema/1.1",
322 "https://nodeinfo.diaspora.software/ns/schema/1.0",
323 "http://nodeinfo.diaspora.software/ns/schema/1.0",
326 # No CSRF by default, you don't have to add network.api_headers by yourself here
# CSRF check; on failure record the error and (presumably) return an error dict.
330 logger.debug("Checking CSRF for domain='%s'", domain)
331 headers = csrf.determine(domain, dict())
332 except network.exceptions as exception:
333 logger.warning("Exception '%s' during checking CSRF (fetch_wellknown_nodeinfo,%s) - EXIT!", type(exception), __name__)
334 instances.set_last_error(domain, exception)
337 "error_message": type(exception),
338 "exception" : exception,
# Fetch the well-known discovery document itself.
341 logger.debug("Fetching .well-known info for domain='%s'", domain)
342 data = network.get_json_api(
344 "/.well-known/nodeinfo",
346 (config.get("nodeinfo_connection_timeout"), config.get("nodeinfo_read_timeout"))
349 logger.debug("data[]='%s'", type(data))
350 if "error_message" not in data:
351 nodeinfo = data["json"]
353 logger.debug("Marking domain='%s' as successfully handled ...", domain)
354 instances.set_success(domain)
356 logger.debug("Found entries: nodeinfo()=%d,domain='%s'", len(nodeinfo), domain)
357 if "links" in nodeinfo:
358 logger.debug("Found nodeinfo[links]()=%d record(s),", len(nodeinfo["links"]))
# Scan identifiers in preference order; inner loop checks each link record.
359 for niid in nodeinfo_identifier:
362 logger.debug("Checking niid='%s' ...", niid)
363 for link in nodeinfo["links"]:
364 logger.debug("link[%s]='%s'", type(link), link)
365 if not isinstance(link, dict) or not "rel" in link:
366 logger.debug("link[]='%s' is not of type 'dict' or no element 'rel' found - SKIPPED!", type(link))
368 elif link["rel"] != niid:
369 logger.debug("link[re]='%s' does not matched niid='%s' - SKIPPED!", link["rel"], niid)
371 elif "href" not in link:
372 logger.warning("link[rel]='%s' has no element 'href' - SKIPPED!", link["rel"])
374 elif link["href"] is None:
375 logger.debug("link[href] is None, link[rel]='%s' - SKIPPED!", link["rel"])
378 # Default is that 'href' has a complete URL, but some hosts don't send that
379 logger.debug("link[rel]='%s' matches niid='%s'", link["rel"], niid)
381 components = urlparse(url)
383 logger.debug("components[%s]='%s'", type(components), components)
# Normalize incomplete hrefs: no scheme+host -> prepend https://domain;
# scheme but no host -> splice the domain into the URL.
384 if components.scheme == "" and components.netloc == "":
385 logger.warning("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
386 url = f"https://{domain}{url}"
387 components = urlparse(url)
388 elif components.netloc == "":
389 logger.warning("link[href]='%s' has no netloc set, setting domain='%s'", link["href"], domain)
390 url = f"{components.scheme}://{domain}{components.path}"
391 components = urlparse(url)
393 logger.debug("components.netloc[]='%s'", type(components.netloc))
394 if not utils.is_domain_wanted(components.netloc):
395 logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
# Fetch the discovered nodeinfo URL and record the outcome.
398 logger.debug("Fetching nodeinfo from url='%s' ...", url)
399 data = network.fetch_api_url(
401 (config.get("connection_timeout"), config.get("read_timeout"))
404 logger.debug("link[href]='%s',data[]='%s'", link["href"], type(data))
405 if "error_message" not in data and "json" in data:
406 logger.debug("Found JSON data()=%d,link[href]='%s' - Setting detection_mode=AUTO_DISCOVERY ...", len(data), link["href"])
407 instances.set_detection_mode(domain, "AUTO_DISCOVERY")
408 instances.set_nodeinfo_url(domain, link["href"])
410 logger.debug("Marking domain='%s' as successfully handled ...", domain)
411 instances.set_success(domain)
414 logger.debug("Setting last error for domain='%s',data[]='%s'", domain, type(data))
415 instances.set_last_error(domain, data)
417 logger.debug("data()=%d", len(data))
418 if "error_message" not in data and "json" in data:
419 logger.debug("Auto-discovery successful: domain='%s'", domain)
422 logger.warning("nodeinfo does not contain 'links': domain='%s'", domain)
424 logger.debug("Returning data[]='%s' - EXIT!", type(data))
# Determine a domain's software by scraping HTML meta tags from 'path'.
#
# Fetches the page and, when the response is HTML for the expected domain,
# inspects (in priority order) <meta name="generator">, og:site_name and
# og:platform, setting detection_mode to GENERATOR / SITE_NAME / PLATFORM
# respectively.  A redirect to a different domain registers that domain and
# raises requests.exceptions.TooManyRedirects.  The detected string is then
# cleaned via the version helpers (strip version numbers, "powered by",
# "hosted on", " by ", " see ").
#
# NOTE(review): the initial 'software = None' assignment and the return
# statement are missing from this dump — confirm against the original file.
427 def fetch_generator_from_path(domain: str, path: str = "/") -> str:
428 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
429 domain_helper.raise_on(domain)
431 if not isinstance(path, str):
432 raise ValueError(f"path[]='{type(path)}' is not of type 'str'")
434 raise ValueError("Parameter 'path' is empty")
436 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
439 logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
440 response = network.fetch_response(
443 (config.get("connection_timeout"), config.get("read_timeout")),
447 logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
# Only parse successful, non-redirected HTML responses for the same domain.
448 if response.ok and response.status_code < 300 and response.text.find("<html") > 0 and domain_helper.is_in_url(domain, response.url):
449 logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
450 doc = bs4.BeautifulSoup(response.text, "html.parser")
452 logger.debug("doc[]='%s'", type(doc))
453 generator = doc.find("meta", {"name" : "generator"})
454 site_name = doc.find("meta", {"property": "og:site_name"})
455 platform = doc.find("meta", {"property": "og:platform"})
457 logger.debug("generator[]='%s',site_name[]='%s',platform[]='%s'", type(generator), type(site_name), type(platform))
# Priority 1: explicit generator meta tag.
458 if isinstance(generator, bs4.element.Tag) and isinstance(generator.get("content"), str):
459 logger.debug("Found generator meta tag: domain='%s'", domain)
460 software = tidyup.domain(generator.get("content"))
462 logger.debug("software[%s]='%s'", type(software), software)
463 if software is not None and software != "":
464 logger.info("domain='%s' is generated by software='%s' - Setting detection_mode=GENERATOR ...", domain, software)
465 instances.set_detection_mode(domain, "GENERATOR")
# Priority 2: og:site_name property.
466 elif isinstance(site_name, bs4.element.Tag) and isinstance(site_name.get("content"), str):
467 logger.debug("Found property=og:site_name, domain='%s'", domain)
468 software = tidyup.domain(site_name.get("content"))
470 logger.debug("software[%s]='%s'", type(software), software)
471 if software is not None and software != "":
472 logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
473 instances.set_detection_mode(domain, "SITE_NAME")
# Priority 3: og:platform property.
474 elif isinstance(platform, bs4.element.Tag) and isinstance(platform.get("content"), str):
475 logger.debug("Found property=og:platform, domain='%s'", domain)
476 software = tidyup.domain(platform.get("content"))
478 logger.debug("software[%s]='%s'", type(software), software)
479 if software is not None and software != "":
480 logger.debug("domain='%s' has og:platform='%s' - Setting detection_mode=PLATFORM ...", domain, software)
481 instances.set_detection_mode(domain, "PLATFORM")
# Redirected to another domain: register it, reset this domain's detection
# data and raise so the caller knows the fetch was diverted.
482 elif not domain_helper.is_in_url(domain, response.url):
483 logger.warning("domain='%s' doesn't match response.url='%s', maybe redirect to other domain?", domain, response.url)
485 components = urlparse(response.url)
487 logger.debug("components[]='%s'", type(components))
488 if not instances.is_registered(components.netloc):
489 logger.info("components.netloc='%s' is not registered, adding ...", components.netloc)
490 fetch_instances(components.netloc, domain, None, "fetch_generator")
492 message = f"Redirect from domain='{domain}' to response.url='{response.url}'"
493 instances.set_last_error(domain, message)
494 instances.set_software(domain, None)
495 instances.set_detection_mode(domain, None)
496 instances.set_nodeinfo_url(domain, None)
498 raise requests.exceptions.TooManyRedirects(message)
# Post-processing: empty string becomes None; strip version numbers and
# marketing suffixes from the detected software string.
500 logger.debug("software[]='%s'", type(software))
501 if isinstance(software, str) and software == "":
502 logger.debug("Corrected empty string to None for software of domain='%s'", domain)
504 elif isinstance(software, str) and ("." in software or " " in software):
505 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
506 software = version.remove(software)
508 logger.debug("software[]='%s'", type(software))
509 if isinstance(software, str) and "powered by " in software:
510 logger.debug("software='%s' has 'powered by' in it", software)
511 software = version.remove(version.strip_powered_by(software))
512 elif isinstance(software, str) and " hosted on " in software:
513 logger.debug("software='%s' has 'hosted on' in it", software)
514 software = version.remove(version.strip_hosted_on(software))
515 elif isinstance(software, str) and " by " in software:
516 logger.debug("software='%s' has ' by ' in it", software)
517 software = version.strip_until(software, " by ")
518 elif isinstance(software, str) and " see " in software:
519 logger.debug("software='%s' has ' see ' in it", software)
520 software = version.strip_until(software, " see ")
522 logger.debug("software='%s' - EXIT!", software)
# Determine the software name running on a domain.
#
# Primary source is fetch_nodeinfo(); on an embedded "exception" the
# exception is re-raised, on "error_message" the HTML-scraping fallback
# fetch_generator_from_path() is used instead.  JSON responses lacking
# [software][name] (or carrying only an error/message) reset the domain's
# detection_mode/nodeinfo_url and also fall back to the generator scrape.
# The result is normalized through software_helper.alias() and cleaned of
# version numbers / "powered by" decorations before being returned.
#
# NOTE(review): try: lines and the return statements (including the
# "Returning None" branch) are missing from this dump — confirm against the
# original file.
525 def determine_software(domain: str, path: str = None) -> str:
526 logger.debug("domain='%s',path='%s' - CALLED!", domain, path)
527 domain_helper.raise_on(domain)
529 if not isinstance(path, str) and path is not None:
530 raise ValueError(f"Parameter path[]='{type(path)}' is not of type 'str'")
532 logger.debug("Determining software for domain='%s',path='%s'", domain, path)
# Primary detection path: nodeinfo.
535 logger.debug("Fetching nodeinfo from domain='%s' ...", domain)
536 data = fetch_nodeinfo(domain, path)
538 logger.debug("data[%s]='%s'", type(data), data)
539 if "exception" in data:
540 # Continue raising it
541 logger.debug("data()=%d contains exception='%s' - raising ...", len(data), type(data["exception"]))
542 raise data["exception"]
543 elif "error_message" in data:
544 logger.debug("Returned error_message during fetching nodeinfo: '%s',status_code=%d", data['error_message'], data['status_code'])
545 software = fetch_generator_from_path(domain)
546 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
548 logger.debug("domain='%s',path='%s',data[json] found ...", domain, path)
551 logger.debug("JSON response from domain='%s' does not include [software][name], fetching / ...", domain)
552 software = fetch_generator_from_path(domain)
553 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
# Inspect the JSON payload: error status, proper [software][name], bare
# message, or missing name — each with its own fallback handling.
555 if "status" in data and data["status"] == "error" and "message" in data:
556 logger.warning("JSON response is an error: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
557 instances.set_last_error(domain, data["message"])
558 instances.set_detection_mode(domain, None)
559 instances.set_nodeinfo_url(domain, None)
560 software = fetch_generator_from_path(domain)
561 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
562 elif "software" in data and "name" in data["software"]:
563 logger.debug("Found data[json][software][name] in JSON response")
564 software = data["software"]["name"]
565 logger.debug("software[%s]='%s' - FOUND!", type(software), software)
566 elif "message" in data:
567 logger.warning("JSON response contains only a message: '%s' - Resetting detection_mode,nodeinfo_url ...", data["message"])
568 instances.set_last_error(domain, data["message"])
569 instances.set_detection_mode(domain, None)
570 instances.set_nodeinfo_url(domain, None)
572 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
573 software = fetch_generator_from_path(domain)
574 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
575 elif "software" not in data or "name" not in data["software"]:
576 logger.debug("JSON response from domain='%s' does not include [software][name] - Resetting detection_mode,nodeinfo_url ...", domain)
577 instances.set_detection_mode(domain, None)
578 instances.set_nodeinfo_url(domain, None)
580 logger.debug("Invoking fetch_generator_from_path(%s) ...", domain)
581 software = fetch_generator_from_path(domain)
582 logger.debug("Generator for domain='%s' is: '%s'", domain, software)
584 logger.debug("software[%s]='%s'", type(software), software)
586 logger.debug("Returning None - EXIT!")
# Normalization: map aliases, retry via generator on empty result, strip
# version numbers and "powered by" decorations.
589 logger.debug("software='%s'- BEFORE!", software)
590 software = software_helper.alias(software)
591 logger.debug("software['%s']='%s' - AFTER!", type(software), software)
593 if str(software) == "":
594 logger.debug("software for domain='%s' was not detected, trying generator ...", domain)
595 software = fetch_generator_from_path(domain)
596 elif len(str(software)) > 0 and ("." in software or " " in software):
597 logger.debug("software='%s' may contain a version number, domain='%s', removing it ...", software, domain)
598 software = version.remove(software)
600 logger.debug("software[]='%s'", type(software))
601 if isinstance(software, str) and "powered by" in software:
602 logger.debug("software='%s' has 'powered by' in it", software)
603 software = version.remove(version.strip_powered_by(software))
605 logger.debug("software='%s' - EXIT!", software)
# Extract (domain, reason) records from an HTML table.
#
# Expects a bs4 Tag containing <tr> rows where the first <td> holds a domain
# and the second a block reason.  Rows without a <td>, unwanted domains and
# invalid domain names are skipped; one known row listing multiple gab.com
# domains ("gab.com/.ai, develop.gab.com") is special-cased and expanded.
# Returns the collected list of domain records.
#
# NOTE(review): the domains-list initialisation, the append statements, the
# special-case expansion body and the return are missing from this dump —
# confirm against the original file.
608 def find_domains(tag: bs4.element.Tag) -> list:
609 logger.debug("tag[]='%s' - CALLED!", type(tag))
610 if not isinstance(tag, bs4.element.Tag):
611 raise ValueError(f"Parameter tag[]='{type(tag)}' is not type of bs4.element.Tag")
612 elif len(tag.select("tr")) == 0:
613 raise KeyError("No table rows found in table!")
616 for element in tag.select("tr"):
617 logger.debug("element[]='%s'", type(element))
618 if not element.find("td"):
619 logger.debug("Skipping element, no <td> found")
# First cell is the domain, second cell the block reason.
622 domain = tidyup.domain(element.find("td").text)
623 reason = tidyup.reason(element.findAll("td")[1].text)
625 logger.debug("domain='%s',reason='%s'", domain, reason)
627 if not utils.is_domain_wanted(domain):
628 logger.debug("domain='%s' is blacklisted - SKIPPED!", domain)
630 elif domain == "gab.com/.ai, develop.gab.com":
631 logger.debug("Multiple domains detected in one row")
641 "domain": "develop.gab.com",
645 elif not validators.domain(domain.split("/")[0]):
646 logger.warning("domain='%s' is not a valid domain - SKIPPED!", domain)
649 logger.debug("Adding domain='%s',reason='%s' ...", domain, reason)
655 logger.debug("domains()=%d - EXIT!", len(domains))
# Flatten a Mastodon-style federation dict into a list of peer domains.
#
# Looks at the "linked", "allowed" and "blocked" keys of 'rows'; each entry
# may be a plain domain string or a dict with a "domain" key.  Entries are
# tidied via tidyup.domain() and filtered through utils.is_domain_wanted()
# before being collected.  Unsupported entry types raise ValueError.
# Returns the collected peers list.
#
# NOTE(review): the peers-list initialisation, continue statements, the
# append call and the return are missing from this dump — confirm against
# the original file.
658 def add_peers(rows: dict) -> list:
659 logger.debug("rows[]='%s' - CALLED!", type(rows))
660 if not isinstance(rows, dict):
661 raise ValueError(f"Parameter rows[]='{type(rows)}' is not of type 'dict'")
664 for key in ["linked", "allowed", "blocked"]:
665 logger.debug("Checking key='%s'", key)
666 if key not in rows or rows[key] is None:
667 logger.debug("Cannot find key='%s' or it is NoneType - SKIPPED!", key)
670 logger.debug("Adding %d peer(s) to peers list ...", len(rows[key]))
671 for peer in rows[key]:
672 logger.debug("peer[%s]='%s' - BEFORE!", type(peer), peer)
# Normalize each entry: skip empties, unwrap dicts, tidy plain strings.
673 if peer is None or peer == "":
674 logger.debug("peer is empty - SKIPPED")
676 elif isinstance(peer, dict) and "domain" in peer:
677 logger.debug("peer[domain]='%s'", peer["domain"])
678 peer = tidyup.domain(peer["domain"])
679 elif isinstance(peer, str):
680 logger.debug("peer='%s'", peer)
681 peer = tidyup.domain(peer)
683 raise ValueError(f"peer[]='{type(peer)}' is not supported,key='{key}'")
685 logger.debug("peer[%s]='%s' - AFTER!", type(peer), peer)
686 if not utils.is_domain_wanted(peer):
687 logger.debug("peer='%s' is not wanted - SKIPPED!", peer)
690 logger.debug("Appending peer='%s' ...", peer)
693 logger.debug("peers()=%d - EXIT!", len(peers))