git.mxchange.org Git - fba.git/commitdiff
Continued:
author Roland Häder <roland@mxchange.org>
Mon, 24 Jul 2023 14:35:53 +0000 (16:35 +0200)
committer Roland Häder <roland@mxchange.org>
Mon, 24 Jul 2023 14:43:32 +0000 (16:43 +0200)
- added command fetch_relays() for fetching instances from ActivityPub relays
  that list their peers on their index page (/)
- added grid.tf to the blacklist, as this domain flooded the federation tables
  with a lot of "testing/developing" sub-domains
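
For illustration, the new command could be invoked like this (a hypothetical
invocation, assuming the project's usual fba.py entry point; the --domain and
--force flags are the ones added in fba/boot.py below):

    ./fba.py fetch_relays --domain=relay.example.org --force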

fba/boot.py
fba/commands.py
fba/helpers/blacklist.py

index 7bd816aab3d8399723007ef4f6aee704c3e5fb7a..abc94acde76a8d3a0578e180093ac16dd5ff859e 100644 (file)
@@ -219,6 +219,8 @@ def init_parser():
         help="Fetches instances from ActivityPub relays",
     )
     parser.set_defaults(command=commands.fetch_relays)
+    parser.add_argument("--domain", help="Instance name (aka. 'relay')")
+    parser.add_argument("--force", action="store_true", help="Forces update of data, no matter what.")
 
     logger.debug("EXIT!")
 
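The hunk above only shows the tail of the sub-parser setup. As a minimal,
self-contained sketch (hypothetical names, not the project's actual boot
code), the dispatch pattern it relies on works like this:

    import argparse

    def fetch_relays(args: argparse.Namespace) -> int:
        # Stand-in for fba.commands.fetch_relays()
        print(f"domain={args.domain}, force={args.force}")
        return 0

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    cmd = subparsers.add_parser("fetch_relays", help="Fetches instances from ActivityPub relays")
    cmd.set_defaults(command=fetch_relays)
    cmd.add_argument("--domain", help="Instance name (aka. 'relay')")
    cmd.add_argument("--force", action="store_true", help="Forces update of data, no matter what.")

    args = parser.parse_args(["fetch_relays", "--domain", "relay.example.org"])
    args.command(args)  # set_defaults() makes the handler available as args.command
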
index d94966f7f00d60f41571ae31f75e6065af56f155..7b9849d393ae2ec1979fbaf6616286e4ef317458 100644 (file)
@@ -36,6 +36,7 @@ from fba import utils
 from fba.helpers import blacklist
 from fba.helpers import config
 from fba.helpers import cookies
+from fba.helpers import dicts as dict_helper
 from fba.helpers import locking
 from fba.helpers import processing
 from fba.helpers import software as software_helper
@@ -121,7 +122,7 @@ def fetch_pixelfed_api(args: argparse.Namespace) -> int:
         return list()
 
     try:
-        logger.debug("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
+        logger.info("Fetching JSON from pixelfed.org API, headers()=%d ...", len(headers))
         fetched = network.get_json_api(
             source_domain,
             "/api/v1/servers/all.json?scope=All&country=all&language=all",
@@ -569,7 +570,12 @@ def fetch_todon_wiki(args: argparse.Namespace) -> int:
         "reject": list(),
     }
 
-    raw = utils.fetch_url(f"https://{source_domain}/todon/domainblocks", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    logger.debug("Fetching domainblocks from source_domain='%s'", source_domain)
+    raw = utils.fetch_url(
+        f"https://{source_domain}/todon/domainblocks",
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    ).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(raw, "html.parser")
@@ -672,7 +678,12 @@ def fetch_cs(args: argparse.Namespace):
         logger.debug("source_domain='%s' has not been recently used, marking ...", source_domain)
         sources.update(source_domain)
 
-    raw = utils.fetch_url(f"https://{source_domain}/chaossocial/meta/master/federation.md", network.web_headers, (config.get("connection_timeout"), config.get("read_timeout"))).text
+    logger.info("Fetching federation.md from source_domain='%s' ...", source_domain)
+    raw = utils.fetch_url(
+        f"https://{source_domain}/chaossocial/meta/master/federation.md",
+        network.web_headers,
+        (config.get("connection_timeout"), config.get("read_timeout"))
+    ).text
     logger.debug("raw()=%d,raw[]='%s'", len(raw), type(raw))
 
     doc = bs4.BeautifulSoup(markdown.markdown(raw, extensions=extensions), features="html.parser")
@@ -1727,6 +1738,7 @@ def fetch_instances_social(args: argparse.Namespace) -> int:
         "Authorization": f"Bearer {config.get('instances_social_api_key')}",
     }
 
+    logger.info("Fetching list from source_domain='%s' ...", source_domain)
     fetched = network.get_json_api(
         source_domain,
         "/api/1.0/instances/list?count=0&sort_by=name",
@@ -1787,14 +1799,138 @@ def fetch_instances_social(args: argparse.Namespace) -> int:
 def fetch_relays(args: argparse.Namespace) -> int:
     logger.debug("args[]='%s' - CALLED!", type(args))
 
-    database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
+    if args.domain is not None and args.domain != "":
+        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay') AND domain = ? LIMIT 1", [args.domain])
+    else:
+        database.cursor.execute("SELECT domain, software FROM instances WHERE software IN ('activityrelay', 'aoderelay', 'selective-relay')")
 
     domains = list()
-
     rows = database.cursor.fetchall()
+
     logger.info("Checking %d relays ...", len(rows))
     for row in rows:
-        logger.debug("Fetching peers from row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+        logger.debug("row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+        if not args.force and instances.is_recent(row["domain"]):
+            logger.debug("row[domain]='%s' has been recently fetched - SKIPPED!", row["domain"])
+            continue
+
+        try:
+            logger.info("Fetching / from relay row[domain]='%s',row[software]='%s' ...", row["domain"], row["software"])
+            raw = utils.fetch_url(
+                f"https://{row['domain']}",
+                network.web_headers,
+                (config.get("connection_timeout"), config.get("read_timeout"))
+            ).text
+            logger.debug("raw[%s]()=%d", type(raw), len(raw))
+        except network.exceptions as exception:
+            logger.warning("Exception '%s' during fetching from relay '%s': '%s'", type(exception), row["domain"], str(exception))
+            instances.set_last_error(row["domain"], exception)
+            instances.set_last_instance_fetch(row["domain"])
+            instances.update_data(row["domain"])
+            continue
+
+        doc = bs4.BeautifulSoup(raw, features="html.parser")
+        logger.debug("doc[]='%s'", type(doc))
+
+        logger.debug("row[software]='%s'", row["software"])
+        if row["software"] == "activityrelay":
+            logger.debug("Checking row[domain]='%s' ...", row["domain"])
+            tags = doc.findAll("p")
+
+            logger.debug("Checking %d paragraphs ...", len(tags))
+            for tag in tags:
+                logger.debug("tag[]='%s'", type(tag))
+                if len(tag.contents) == 0:
+                    logger.debug("tag='%s' is an empty tag - SKIPPED!", tag)
+                    continue
+                elif "registered instances" not in tag.contents[0]:
+                    logger.debug("Skipping paragraph, text not found.")
+                    continue
+
+                logger.debug("Found tag.contents[0][]='%s'", tag.contents[0])
+                for domain in tag.contents:
+                    logger.debug("domain[%s]='%s'", type(domain), domain)
+                    if not isinstance(domain, bs4.element.NavigableString) or "registered instances" in domain:
+                        continue
+
+                    domain = str(domain)
+                    if not utils.is_domain_wanted(domain):
+                        logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                        continue
+
+                    logger.debug("domain='%s' - BEFORE!", domain)
+                    domain = tidyup.domain(domain)
+                    logger.debug("domain='%s' - AFTER!", domain)
+
+                    if domain == "":
+                        logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
+                        continue
+                    elif instances.is_registered(domain):
+                        logger.debug("domain='%s' is already registered - SKIPPED!", domain)
+                        continue
+                    elif dict_helper.has_key(domains, "domain", domain):
+                        logger.debug("domain='%s' already added", domain)
+                        continue
+
+                    logger.debug("Appending domain='%s',origin='%s',software='%s' ...", domain, row["domain"], row["software"])
+                    domains.append({
+                        "domain": domain,
+                        "origin": row["domain"],
+                    })
+        elif row["software"] in ["aoderelay", "selective-relay"]:
+            logger.debug("Checking row[domain]='%s' ...", row["domain"])
+            if row["software"] == "aoderelay":
+                tags = doc.findAll("section", {"class": "instance"})
+            else:
+                tags = doc.find("div", {"id": "instances"}).findAll("li")
+
+            logger.debug("Checking %d tags ...", len(tags))
+            for tag in tags:
+                logger.debug("tag[]='%s'", type(tag))
+
+                link = tag.find("a")
+                logger.debug("link[%s]='%s'", type(link), link)
+                if link is None:
+                    logger.warning("tag='%s' has no a-tag ...", tag)
+                    continue
+
+                components = urlparse(link["href"])
+                domain = components.netloc.lower()
+
+                if not utils.is_domain_wanted(domain):
+                    logger.debug("domain='%s' is not wanted - SKIPPED!", domain)
+                    continue
+
+                logger.debug("domain='%s' - BEFORE!", domain)
+                domain = tidyup.domain(domain)
+                logger.debug("domain='%s' - AFTER!", domain)
+
+                if domain == "":
+                    logger.debug("Empty domain after tidyup.domain() from origin='%s' - SKIPPED!", row["domain"])
+                    continue
+                elif instances.is_registered(domain):
+                    logger.debug("domain='%s' is already registered - SKIPPED!", domain)
+                    continue
+                elif dict_helper.has_key(domains, "domain", domain):
+                    logger.debug("domain='%s' already added", domain)
+                    continue
+
+                logger.debug("Appending domain='%s',origin='%s',software='%s'", domain, row["domain"], row["software"])
+                domains.append({
+                    "domain": domain,
+                    "origin": row["domain"],
+                })
+        else:
+            logger.warning("row[domain]='%s',row[software]='%s' is not supported", row["domain"], row["software"])
+
+        logger.debug("Updating last_instance_fetch for row[domain]='%s' ...", row["domain"])
+        instances.set_last_instance_fetch(row["domain"])
+        instances.update_data(row["domain"])
+
+    logger.info("Found %d domains to add ...", len(domains))
+    for row in domains:
+        logger.info("Fetching row[domain]='%s',row[origin]='%s' ...", row["domain"], row["origin"])
+        federation.fetch_instances(row["domain"], row["origin"], None, inspect.currentframe().f_code.co_name)
 
     logger.debug("Success! - EXIT!")
     return 0
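
The newly imported fba.helpers.dicts module is not part of this diff. Judging
from the call sites above, has_key() checks whether a list of dicts already
contains a given key/value pair; a minimal sketch of such a helper (an
assumption, not the actual module) could look like this:

    from typing import Any

    def has_key(lists: list, key: str, value: Any) -> bool:
        # True when any dict in the list has the given key set to the given value
        return any(isinstance(row, dict) and row.get(key) == value for row in lists)

    domains = [{"domain": "example.org", "origin": "relay.example.org"}]
    print(has_key(domains, "domain", "example.org"))  # True
    print(has_key(domains, "domain", "example.com"))  # False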
index d2426b0b7887f52ab2976ac24e4782ee8ccd749d..891a337ee7a0f018b861cea8020c51fbaa9e63a5 100644 (file)
@@ -32,6 +32,7 @@ _blacklist = {
     "lhr.life"            : "Floods federation tables with fake nodes",
     "localhost.run"       : "Floods federation tables with fake nodes",
     "loca.lt"             : "Floods federation tables with fake nodes",
+    "grid.tf"             : "Floods federation tables with fake nodes",
     "ngrok.io"            : "Testing/developing instances shouldn't be part of public instances",
     "ngrok.app"           : "Testing/developing instances shouldn't be part of public instances",
     "ngrok-free.app"      : "Testing/developing instances shouldn't be part of public instances",