# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
+import validators
from urllib.parse import urlparse
logger.debug("Marking domain='%s' as successfully handled ...", domain)
instances.set_success(domain)
- logger.debug("Found infos[links]()=%d record(s),", len(infos["links"]))
+ logger.debug("Checking %d nodeinfo ids ...", len(_nodeinfo_identifier))
for niid in _nodeinfo_identifier:
data = dict()
- logger.debug("Checking niid='%s' ...", niid)
+ logger.debug("Checking niid='%s' for infos[links]()=%d ...", niid, len(infos["links"]))
for link in infos["links"]:
logger.debug("link[%s]='%s'", type(link), link)
if not isinstance(link, dict) or not "rel" in link:
logger.warning("link[rel]='%s' has no element 'href' - SKIPPED!", link["rel"])
continue
elif link["href"] in [None, ""]:
- logger.debug("link[href]='%s',link[rel]='%s' - SKIPPED!", link["href"], link["rel"])
+ logger.debug("link[href]='%s' is empty, link[rel]='%s' - SKIPPED!", link["href"], link["rel"])
+ continue
+ elif not validators.url(link["href"]):
+ logger.warning("link[href]='%s' is not a valid domain - SKIPPED!", link["href"])
continue
# Default is that 'href' has a complete URL, but some hosts don't send that
logger.debug("link[rel]='%s' matches niid='%s'", link["rel"], niid)
url = link["href"].lower()
+
+ logger.debug("Parsing url='%s' ...", url)
components = urlparse(url)
logger.debug("components[%s]='%s'", type(components), components)