fba/csrf.py

   1 # Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
   2 # Copyright (C) 2023 Free Software Foundation
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU Affero General Public License as published
   6 # by the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU Affero General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
  16
  17 import bs4
  18 import reqto
  19 import validators
  20
  21
  22 from fba.helpers import config
  23 from fba.helpers import cookies
  24
  25 from fba.http import network
  26
  27 def determine(domain: str, headers: dict) -> dict:
  28     # DEBUG: print(f"DEBUG: domain='{domain}',headers()={len(headers)} - CALLED!")
  29     if not isinstance(domain, str):
  30         raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
  31     elif domain == "":
  32         raise ValueError("Parameter 'domain' is empty")
  33     elif not validators.domain(domain.split("/")[0]):
  34         raise ValueError(f"domain='{domain}' is not a valid domain")
  35     elif domain.endswith(".arpa"):
  36         raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
  37     elif domain.endswith(".tld"):
  38         raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
  39     elif not isinstance(headers, dict):
  40         raise ValueError(f"Parameter headers[]='{type(headers)}' is not 'dict'")
  41
  42     # Default headers with no CSRF
  43     reqheaders = headers
  44
  45     # Fetch / to check for meta tag indicating csrf
  46     # DEBUG: print(f"DEBUG: Fetching / from domain='{domain}' for CSRF check ...")
  47     response = reqto.get(
  48         f"https://{domain}/",
  49         headers=network.web_headers,
  50         timeout=(config.get("connection_timeout"), config.get("read_timeout"))
  51     )
  52
  53     # DEBUG: print(f"DEBUG: response.ok='{response.ok}',response.status_code={response.status_code},response.text()={len(response.text)}")
  54     if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
  55         # Save cookies
  56         # DEBUG: print(f"DEBUG: Parsing response.text()={len(response.text)} Bytes ...")
  57         cookies.store(domain, response.cookies.get_dict())
  58
  59         # Parse text
  60         meta = bs4.BeautifulSoup(
  61             response.text,
  62             "html.parser"
  63         )
  64         # DEBUG: print(f"DEBUG: meta[]='{type(meta)}'")
  65         tag = meta.find("meta", attrs={"name": "csrf-token"})
  66
  67         # DEBUG: print(f"DEBUG: tag={tag}")
  68         if tag is not None:
  69             # DEBUG: print(f"DEBUG: Adding CSRF token='{tag['content']}' for domain='{domain}'")
  70             reqheaders["X-CSRF-Token"] = tag["content"]
  71
  72     # DEBUG: print(f"DEBUG: reqheaders()={len(reqheaders)} - EXIT!")
  73     return reqheaders