"log_level" : "info",
"host" : "127.0.0.1",
"port" : 8069,
- "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/113.0",
+ "useragent" : "Mozilla/5.0 (Windows NT 10.0; rv:113.0) Gecko/20100101 Firefox/113.0",
"connection_timeout": 30,
"read_timeout" : 5,
"hostname" : "fba.ryona.agency",
database.cursor.execute("SELECT domain, software FROM instances WHERE software = ?", [args.software])
else:
logger.info("Fetching domains for recently updated ...")
- database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL", [time.time() - config.get("recheck_block")])
+ database.cursor.execute("SELECT domain, software FROM instances WHERE last_nodeinfo < ? OR last_nodeinfo IS NULL AND software IS NULL AND last_status_code < 999", [time.time() - config.get("recheck_block")])
domains = database.cursor.fetchall()
import logging
+from urllib.parse import urlparse
+
import bs4
import reqto
from fba.http import network
+from fba.models import instances
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
response = reqto.get(
f"https://{domain}/",
headers=network.web_headers,
- timeout=(config.get("connection_timeout"), config.get("read_timeout")),
- allow_redirects=False
+ timeout=(config.get("connection_timeout"), config.get("read_timeout"))
)
+ components = urlparse(response.url)
logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
- if response.ok and response.status_code < 300 and response.text != "" and response.text.find("<html") > 0:
+ # NOTE: membership test instead of find() > 0, which wrongly rejected documents starting directly with "<html" (offset 0)
+ if response.ok and response.status_code < 300 and response.text.strip() != "" and "<html" in response.text and domain == components.netloc:
# Save cookies
logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
cookies.store(domain, response.cookies.get_dict())
if tag is not None:
logger.debug("Adding CSRF token='%s' for domain='%s'", tag["content"], domain)
reqheaders["X-CSRF-Token"] = tag["content"]
+ elif domain != components.netloc:
+ logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc)
+ instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'")
logger.debug("reqheaders()=%d - EXIT!", len(reqheaders))
return reqheaders
logger.debug("components[%s]='%s'", type(components), components)
if components.scheme == "" and components.netloc == "":
- logger.debug("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
+ logger.warning("link[href]='%s' has no scheme and host name in it, prepending from domain='%s'", link['href'], domain)
url = f"https://{domain}{url}"
components = urlparse(url)
+ elif components.netloc == "":
+ logger.warning("link[href]='%s' has no netloc set, setting domain='%s'", link["href"], domain)
+ # Rebuild through the parsed tuple so params, query and fragment are kept
+ # and a path without a leading slash is still separated from the host.
+ url = components._replace(netloc=domain).geturl()
+ components = urlparse(url)
if not utils.is_domain_wanted(components.netloc):
logger.debug("components.netloc='%s' is not wanted - SKIPPED!", components.netloc)
software = None
logger.debug("Fetching path='%s' from domain='%s' ...", path, domain)
- response = network.fetch_response(domain, path, network.web_headers, (config.get("connection_timeout"), config.get("read_timeout")))
+ response = network.fetch_response(
+ domain, path,
+ network.web_headers,
+ (config.get("connection_timeout"), config.get("read_timeout")),
+ allow_redirects=True
+ )
+ components = urlparse(response.url)
logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
- if response.ok and response.status_code < 300 and response.text.find("<html") > 0:
+ # NOTE: membership test instead of find() > 0, which wrongly rejected documents starting directly with "<html" (offset 0)
+ if response.ok and response.status_code < 300 and "<html" in response.text and components.netloc == domain:
logger.debug("Parsing response.text()=%d Bytes ...", len(response.text))
doc = bs4.BeautifulSoup(response.text, "html.parser")
if software is not None and software != "":
logger.debug("domain='%s' has og:site_name='%s' - Setting detection_mode=SITE_NAME ...", domain, software)
instances.set_detection_mode(domain, "SITE_NAME")
+ elif domain != components.netloc:
+ logger.warning("domain='%s' doesn't match components.netloc='%s', maybe redirect to other domain?", domain, components.netloc)
+ instances.set_last_error(domain, f"Redirect from domain='{domain}' to components.netloc='{components.netloc}'")
logger.debug("software[]='%s'", type(software))
if isinstance(software, str) and software == "":
logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
if not response.ok or response.status_code >= 300 or len(response.text.strip()) == 0:
- logger.warning("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text))
+ logger.debug("Cannot query JSON API: domain='%s',path='%s',data()=%d,response.status_code=%d,response.text()=%d", domain, path, len(data), response.status_code, len(response.text))
json_reply["status_code"] = response.status_code
json_reply["error_message"] = response.reason
instances.set_last_error(domain, response)
logger.debug("response.ok='%s',response.status_code=%d,response.text()=%d", response.ok, response.status_code, len(response.text))
if not response.ok or response.status_code >= 300 or len(response.text) == 0:
- logger.warning("Cannot query JSON API: domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text))
+ logger.debug("Cannot query JSON API: domain='%s',path='%s',response.status_code=%d,response.text()=%d", domain, path, response.status_code, len(response.text))
json_reply["status_code"] = response.status_code
json_reply["error_message"] = response.reason
instances.set_last_error(domain, response)
return True
-def fetch_response(domain: str, path: str, headers: dict, timeout: tuple) -> requests.models.Response:
- logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s' - CALLED!", domain, path, len(headers), timeout)
+def fetch_response(domain: str, path: str, headers: dict, timeout: tuple, allow_redirects: bool = False) -> requests.models.Response:
+ logger.debug("domain='%s',path='%s',headers()=%d,timeout='%s',allow_redirects='%s' - CALLED!", domain, path, len(headers), timeout, allow_redirects)
domain_helper.raise_on(domain)
if not isinstance(path, str):
headers=headers,
timeout=timeout,
cookies=cookies.get_all(domain),
- allow_redirects=False
+ allow_redirects=allow_redirects
)
except exceptions as exception: