# Fedi API Block - An aggregator for fetching blocking data from fediverse nodes
# Copyright (C) 2023 Free Software Foundation
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

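"""Mastodon network handler: fetches domain block lists from Mastodon
instances, either via the /api/v1/instance/domain_blocks API endpoint or by
scraping the localized /about(/more) pages."""
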
import inspect
import logging

import bs4
import validators

from fba import csrf
from fba import fba

from fba.helpers import blacklist
from fba.helpers import config
from fba.helpers import tidyup

from fba.http import network

from fba.models import blocks
from fba.models import instances

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Language mapping X -> English
language_mapping = {
    # English -> English
    "Silenced instances"            : "Silenced servers",
    "Suspended instances"           : "Suspended servers",
    "Limited instances"             : "Limited servers",
    "Filtered media"                : "Filtered media",
    # German -> English
    "Gesperrte Server"              : "Suspended servers",
    "Gefilterte Medien"             : "Filtered media",
    "Stummgeschaltete Server"       : "Silenced servers",
    # Japanese -> English
    "停止済みのサーバー"            : "Suspended servers",
    "制限中のサーバー"              : "Limited servers",
    "メディアを拒否しているサーバー": "Filtered media",
    "サイレンス済みのサーバー"      : "Silenced servers",
    # Hebrew -> English
    "שרתים מושעים"                  : "Suspended servers",
    "מדיה מסוננת"                   : "Filtered media",
    "שרתים מוגבלים"                 : "Silenced servers",
    # French -> English
    "Serveurs suspendus"            : "Suspended servers",
    "Médias filtrés"                : "Filtered media",
    "Serveurs limités"              : "Limited servers",
    "Serveurs modérés"              : "Limited servers",
}

def fetch_blocks_from_about(domain: str) -> dict:
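    """Scrape the instance's /about/more (or /about) page and parse the block
    tables under each section header into a dict with the keys "reject",
    "media_removal" and "followers_only"."""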
    logger.debug("domain(%d)='%s' - CALLED!", len(domain), domain)
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")

    logger.debug("Fetching Mastodon blocks from domain='%s' ...", domain)
    doc = None
    for path in ["/about/more", "/about"]:
        try:
            logger.debug(f"Fetching path='{path}' from domain='{domain}' ...")
            doc = bs4.BeautifulSoup(
                network.fetch_response(
                    domain,
                    path,
                    network.web_headers,
                    (config.get("connection_timeout"), config.get("read_timeout"))
                ).text,
                "html.parser",
            )

            if len(doc.find_all("h3")) > 0:
                logger.debug(f"path='{path}' had some headlines - BREAK!")
                break

        except network.exceptions as exception:
            logger.warning(f"Cannot fetch from domain='{domain}',exception='{type(exception)}'")
            instances.set_last_error(domain, exception)
            break

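    # Entries collected per section header, normalized to English via language_mapping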
    blocklist = {
        "Suspended servers": [],
        "Filtered media"   : [],
        "Limited servers"  : [],
        "Silenced servers" : [],
    }

    logger.debug("doc[]='%s'", type(doc))
    if doc is None:
        logger.warning(f"Cannot fetch any /about pages for domain='{domain}' - EXIT!")
        return blocklist

    for header in doc.find_all("h3"):
        header_text = tidyup.reason(header.text)

        logger.debug("header_text='%s'", header_text)
        if header_text in language_mapping:
            header_text = language_mapping[header_text]
            logger.debug("header_text='%s' - mapped!", header_text)
        else:
            logger.warning(f"header_text='{header_text}' not found in language mapping table")

        if header_text in blocklist:
            # replaced find_next_siblings with find_all_next to account for instances that e.g. hide lists in dropdown menu
            for line in header.find_all_next("table")[0].find_all("tr")[1:]:
                blocklist[header_text].append({
                    "domain": tidyup.domain(line.find("span").text),
                    "hash"  : tidyup.domain(line.find("span")["title"][9:]),
                    "reason": tidyup.reason(line.find_all("td")[1].text),
                })
        else:
            logger.warning(f"header_text='{header_text}' not found in blocklist()={len(blocklist)}")

    logger.debug("Returning blocklist for domain='%s' ...", domain)
    return {
        "reject"        : blocklist["Suspended servers"],
        "media_removal" : blocklist["Filtered media"],
        "followers_only": blocklist["Limited servers"] + blocklist["Silenced servers"],
    }

def fetch_blocks(domain: str, origin: str, nodeinfo_url: str):
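    """Fetch blocks from the given Mastodon instance, first via the
    /api/v1/instance/domain_blocks API endpoint and, if that yields no rows,
    by falling back to scraping the /about page. Found blocks are recorded
    via the blocks and instances models and committed to the database."""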
    logger.debug(f"domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}' - CALLED!")
    if not isinstance(domain, str):
        raise ValueError(f"Parameter domain[]='{type(domain)}' is not 'str'")
    elif domain == "":
        raise ValueError("Parameter 'domain' is empty")
    elif domain.lower() != domain:
        raise ValueError(f"Parameter domain='{domain}' must be all lower-case")
    elif not validators.domain(domain.split("/")[0]):
        raise ValueError(f"domain='{domain}' is not a valid domain")
    elif domain.endswith(".arpa"):
        raise ValueError(f"domain='{domain}' is a domain for reversed IP addresses, please don't crawl them!")
    elif domain.endswith(".tld"):
        raise ValueError(f"domain='{domain}' is a fake domain, please don't crawl them!")
    elif not isinstance(origin, str) and origin is not None:
        raise ValueError(f"Parameter origin[]='{type(origin)}' is not 'str'")
    elif origin == "":
        raise ValueError("Parameter 'origin' is empty")
    elif not isinstance(nodeinfo_url, str):
        raise ValueError(f"Parameter nodeinfo_url[]='{type(nodeinfo_url)}' is not 'str'")
    elif nodeinfo_url == "":
        raise ValueError("Parameter 'nodeinfo_url' is empty")

    # No CSRF headers by default; network.api_headers doesn't need to be added here
    headers = dict()

    try:
        logger.debug(f"Checking CSRF for domain='{domain}'")
        headers = csrf.determine(domain, dict())
    except network.exceptions as exception:
        logger.warning(f"Exception '{type(exception)}' during checking CSRF (fetch_blocks,{__name__}) - EXIT!")
        instances.set_last_error(domain, exception)
        return

    try:
        # JSON endpoint for newer Mastodon versions
        found_blocks = list()
        blocklist = list()

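        # "rows" collects block entries per block level; it is filled either from
        # the API response below or from the scraped /about page as a fallback.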
        rows = {
            "reject"        : [],
            "media_removal" : [],
            "followers_only": [],
            "report_removal": [],
        }

        logger.debug("Querying API endpoint /api/v1/instance/domain_blocks from domain='%s' ...", domain)
        data = network.get_json_api(
            domain,
            "/api/v1/instance/domain_blocks",
            headers,
            (config.get("connection_timeout"), config.get("read_timeout"))
        )

        logger.debug("data[]='%s'", type(data))
        if "error_message" in data:
            logger.debug(f"Was not able to fetch domain_blocks from domain='{domain}': status_code='{data['status_code']}',error_message='{data['error_message']}'")
            instances.set_last_error(domain, data)
            return
        elif "json" in data and "error" in data["json"]:
            logger.warning(f"JSON API returned error message: '{data['json']['error']}'")
            instances.set_last_error(domain, data)
            return
        else:
            # Getting blocklist
            blocklist = data["json"]

        if len(blocklist) > 0:
            logger.info("Checking %d entries from domain='%s' ...", len(blocklist), domain)
            for block in blocklist:
                # Check type
                logger.debug(f"block[]='{type(block)}'")
                if not isinstance(block, dict):
                    logger.debug(f"block[]='{type(block)}' is not of type 'dict' - SKIPPED!")
                    continue

                # Map block -> entry
                logger.debug(f"block[{type(block)}]='{block}'")
                entry = {
                    "domain": block["domain"],
                    "hash"  : block["digest"],
                    "reason": block["comment"] if "comment" in block else None
                }

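                # Map the severity reported by the instance onto the internal
                # block levels used by this aggregator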
                logger.debug("severity='%s',domain='%s',hash='%s',comment='%s'", block['severity'], block['domain'], block['digest'], entry['reason'])
                if block['severity'] == 'suspend':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['reject'].append(entry)
                elif block['severity'] == 'silence':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['followers_only'].append(entry)
                elif block['severity'] == 'reject_media':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['media_removal'].append(entry)
                elif block['severity'] == 'reject_reports':
                    logger.debug(f"Adding entry='{entry}' with severity='{block['severity']}' ...")
                    rows['report_removal'].append(entry)
                else:
                    logger.warning(f"Unknown severity='{block['severity']}', domain='{block['domain']}'")
        else:
            logger.debug(f"domain='{domain}' has returned zero rows, trying /about/more page ...")
            rows = fetch_blocks_from_about(domain)

        logger.info("Checking %d block levels from domain='%s' ...", len(rows), domain)
        for block_level, blocklist in rows.items():
            logger.debug("domain='%s',block_level='%s',blocklist()=%d", domain, block_level, len(blocklist))
            block_level = tidyup.domain(block_level)

            logger.debug("block_level='%s' - AFTER!", block_level)
            if block_level == "":
                logger.warning("block_level is empty, domain='%s'", domain)
                continue
            elif block_level == "accept":
                logger.debug(f"domain='{domain}' skipping block_level='accept'")
                continue

            logger.debug(f"Checking {len(blocklist)} entries from domain='{domain}',block_level='{block_level}' ...")
            for block in blocklist:
                logger.debug(f"block[]='{type(block)}'")
                # Unpack explicitly by key instead of relying on dict insertion order
                blocked, blocked_hash, reason = block["domain"], block["hash"], block["reason"]
                logger.debug(f"blocked='{blocked}',blocked_hash='{blocked_hash}',reason='{reason}'")
                blocked = tidyup.domain(blocked)
                reason  = tidyup.reason(reason) if reason is not None and reason != "" else None
                logger.debug(f"blocked='{blocked}',reason='{reason}' - AFTER!")

                if blocked == "":
                    logger.warning("blocked is empty, domain='%s'", domain)
                    continue
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                    continue
                elif blocked.count("*") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("*", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain='%s' ...", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]
                elif blocked.count("?") > 0:
                    # Doing the hash search for instance names as well to tidy up DB
                    row = instances.deobscure("?", blocked, blocked_hash)

                    logger.debug("row[]='%s'", type(row))
                    if row is None:
                        logger.warning(f"Cannot deobfuscate blocked='{blocked}',blocked_hash='{blocked_hash}' - SKIPPED!")
                        continue

                    logger.debug("Updating domain='%s' ...", row[0])
                    blocked      = row[0]
                    origin       = row[1]
                    nodeinfo_url = row[2]

                logger.debug("Looking up instance by domain='%s' ...", blocked)
                if not validators.domain(blocked):
                    logger.warning(f"blocked='{blocked}' is not a valid domain name - SKIPPED!")
                    continue
                elif blocked.endswith(".arpa"):
                    logger.warning(f"blocked='{blocked}' is a reversed .arpa domain and should not be used generally.")
                    continue
                elif blocked.endswith(".tld"):
                    logger.warning(f"blocked='{blocked}' is a fake domain, please don't crawl them!")
                    continue
                elif blacklist.is_blacklisted(blocked):
                    logger.debug("blocked='%s' is blacklisted - SKIPPED!", blocked)
                    continue
                elif not instances.is_registered(blocked):
                    logger.debug(f"Domain blocked='{blocked}' wasn't found, adding ..., domain='{domain}',origin='{origin}',nodeinfo_url='{nodeinfo_url}'")
                    instances.add(blocked, domain, inspect.currentframe().f_code.co_name, nodeinfo_url)

                if not blocks.is_instance_blocked(domain, blocked, block_level):
                    logger.debug("Blocking domain='%s',blocked='%s',block_level='%s' ...", domain, blocked, block_level)
                    blocks.add_instance(domain, blocked, reason, block_level)

                    if block_level == "reject":
                        found_blocks.append({
                            "blocked": blocked,
                            "reason" : reason
                        })
                else:
                    logger.debug(f"Updating block last seen and reason for domain='{domain}',blocked='{blocked}' ...")
                    blocks.update_last_seen(domain, blocked, block_level)
                    blocks.update_reason(reason, domain, blocked, block_level)

        logger.debug("Committing changes ...")
        fba.connection.commit()
    except network.exceptions as exception:
        logger.warning(f"domain='{domain}',exception[{type(exception)}]:'{str(exception)}'")
        instances.set_last_error(domain, exception)

    logger.debug("EXIT!")