Cleanup and documentation of common_ensure_session

[quix0rs-gnu-social.git] / lib / util.php
diff --git a/lib/util.php b/lib/util.php

index fd903d5505a2db28a60f5f277489742516236b93..f2e09daa936fd66801cda26371734bb144297c15 100644 (file)
--- a/lib/util.php
+++ b/lib/util.php
@@ -197,7 +197,7 @@ function common_language()
      if (common_config('site', 'langdetect')) {
          $httplang = isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) ? $_SERVER['HTTP_ACCEPT_LANGUAGE'] : null;
          if (!empty($httplang)) {
-            $language = client_prefered_language($httplang);
+            $language = client_preferred_language($httplang);
              if ($language)
                return $language;
          }
@@ -264,30 +264,36 @@ function common_logged_in()
      return (!is_null(common_current_user()));
  }
  
+function common_local_referer()
+{
+    return isset($_SERVER['HTTP_REFERER'])
+            && parse_url($_SERVER['HTTP_REFERER'], PHP_URL_HOST) === common_config('site', 'server');
+}
+
  function common_have_session()
  {
      return (0 != strcmp(session_id(), ''));
  }
  
+/**
+ * Make sure session is started and handled by
+ * the correct handler.
+ */
  function common_ensure_session()
  {
-    $c = null;
-    if (array_key_exists(session_name(), $_COOKIE)) {
-        $c = $_COOKIE[session_name()];
-    }
      if (!common_have_session()) {
          if (common_config('sessions', 'handle')) {
-            Session::setSaveHandler();
+            session_set_save_handler(new InternalSessionHandler(), true);
          }
-       if (array_key_exists(session_name(), $_GET)) {
-           $id = $_GET[session_name()];
-       } else if (array_key_exists(session_name(), $_COOKIE)) {
-           $id = $_COOKIE[session_name()];
-       }
-       if (isset($id)) {
-           session_id($id);
-       }
-        @session_start();
+        if (array_key_exists(session_name(), $_GET)) {
+            $id = $_GET[session_name()];
+        } else if (array_key_exists(session_name(), $_COOKIE)) {
+            $id = $_COOKIE[session_name()];
+        }
+        if (isset($id)) {
+            session_id($id);
+        }
+        session_start();
          if (!isset($_SESSION['started'])) {
              $_SESSION['started'] = time();
              if (!empty($id)) {
@@ -354,7 +360,7 @@ function common_set_cookie($key, $value, $expiration=0)
                       $expiration,
                       $cookiepath,
                       $server,
-                     common_config('site', 'ssl')=='always');
+                     GNUsocial::useHTTPS());
  }
  
  define('REMEMBERME', 'rememberme');
@@ -575,25 +581,60 @@ function common_canonical_email($email)
      return $email;
  }
  
-function common_purify($html)
+function common_to_alphanumeric($str)
+{
+    $filtered = preg_replace('/[^A-Za-z0-9]\s*/', '', $str);
+    if (strlen($filtered) < 1) {
+        throw new Exception('Filtered string was zero-length.');
+    }
+    return $filtered;
+}
+
+function common_purify($html, array $args=array())
  {
-    require_once INSTALLDIR.'/extlib/htmLawed/htmLawed.php';
+    require_once INSTALLDIR.'/extlib/HTMLPurifier/HTMLPurifier.auto.php';
  
-    $config = array('safe' => 1,    // means that elements=* means elements=*-applet-embed-iframe-object-script or so
-                    'elements' => '*',
-                    'deny_attribute' => 'id,style,on*');
+    $cfg = HTMLPurifier_Config::createDefault();
+    /**
+     * rel values that should be avoided since they can be used to infer
+     * information about the _current_ page, not the h-entry:
+     *
+     *      directory, home, license, payment
+     *
+     * Source: http://microformats.org/wiki/rel
+     */
+    $cfg->set('Attr.AllowedRel', ['bookmark', 'enclosure', 'nofollow', 'tag', 'noreferrer']);
+    $cfg->set('HTML.ForbiddenAttributes', array('style'));  // id, on* etc. are already filtered by default
+    $cfg->set('URI.AllowedSchemes', array_fill_keys(common_url_schemes(), true));
+    if (isset($args['URI.Base'])) {
+        $cfg->set('URI.Base', $args['URI.Base']);   // if null this is like unsetting it I presume
+        $cfg->set('URI.MakeAbsolute', !is_null($args['URI.Base']));   // if we have a URI base, convert relative URLs to absolute ones.
+    }
+    if (common_config('cache', 'dir')) {
+        $cfg->set('Cache.SerializerPath', common_config('cache', 'dir'));
+    }
+    // To use a cache dir other than the default one for HTMLPurifier, set it explicitly, e.g. $config['htmlpurifier']['Cache.SerializerPath'] = '/tmp';
+    foreach (common_config('htmlpurifier') as $key=>$val) {
+        $cfg->set($key, $val);
+    }
  
-    // Remove more elements than what the 'safe' filter gives (elements must be '*' before this)
-    // http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
+    // Remove more elements than the default filter does; by default GNU social also forbids
+    // remotely linked resources such as img, video and audio.
+    $forbiddenElements = array();
      foreach (common_config('htmlfilter') as $tag=>$filter) {
          if ($filter === true) {
-            $config['elements'] .= "-{$tag}";
+            $forbiddenElements[] = $tag;
          }
      }
+    $cfg->set('HTML.ForbiddenElements', $forbiddenElements);
  
      $html = common_remove_unicode_formatting($html);
  
-    return htmLawed($html, $config);
+    $purifier = new HTMLPurifier($cfg);
+    $purified = $purifier->purify($html);
+    Event::handle('EndCommonPurify', array(&$purified, $html));
+    
+    return $purified;
  }
  
  function common_remove_unicode_formatting($text)
@@ -665,7 +706,7 @@ function common_linkify_mention(array $mention)
          $xs = new XMLStringer(false);
  
          $attrs = array('href' => $mention['url'],
-                       'class' => 'h-card '.$mention['type']);
+                       'class' => 'h-card u-url p-nickname '.$mention['type']);
  
          if (!empty($mention['title'])) {
              $attrs['title'] = $mention['title'];
@@ -681,6 +722,23 @@ function common_linkify_mention(array $mention)
      return $output;
  }
  
+function common_get_attentions($text, Profile $sender, Notice $parent=null)
+{
+    $mentions = common_find_mentions($text, $sender, $parent);
+    $atts = array();
+    foreach ($mentions as $mention) {
+        foreach ($mention['mentioned'] as $mentioned) {
+            $atts[$mentioned->getUri()] = $mentioned->getObjectType();
+        }
+    }
+    if ($parent instanceof Notice) {
+        $parentAuthor = $parent->getProfile();
+        // afaik groups can't be authors
+        $atts[$parentAuthor->getUri()] = ActivityObject::PERSON;
+    }
+    return $atts;
+}
+
  /**
   * Find @-mentions in the given text, using the given notice object as context.
   * References will be resolved with common_relative_profile() against the user
@@ -704,22 +762,17 @@ function common_find_mentions($text, Profile $sender, Notice $parent=null)
      if (Event::handle('StartFindMentions', array($sender, $text, &$mentions))) {
          // Get the context of the original notice, if any
          $origMentions = array();
-
          // Does it have a parent notice for context?
          if ($parent instanceof Notice) {
-            $ids = $parent->getReplies();   // replied-to _profile ids_
-
-            foreach ($ids as $id) {
-                try {
-                    $repliedTo = Profile::getByID($id);
-                    $origMentions[$repliedTo->getNickname()] = $repliedTo;
-                } catch (NoResultException $e) {
-                    // continue foreach
+            foreach ($parent->getAttentionProfiles() as $repliedTo) {
+                if (!$repliedTo->isPerson()) {
+                    continue;
                  }
+                $origMentions[$repliedTo->id] = $repliedTo;
              }
          }
  
-        $matches = common_find_mentions_raw($text);
+        $matches = common_find_mentions_raw($text, '@');
  
          foreach ($matches as $match) {
              try {
@@ -729,25 +782,35 @@ function common_find_mentions($text, Profile $sender, Notice $parent=null)
                  continue;
              }
  
-            // Try to get a profile for this nickname.
-            // Start with conversation context, then go to
-            // sender context.
+                       // primarily mention the profiles mentioned in the parent
+            $mention_found_in_origMentions = false;
+            foreach($origMentions as $origMentionsId=>$origMention) {
+                if($origMention->getNickname() == $nickname) {
+                    $mention_found_in_origMentions = $origMention;
+                    // Don't mention the same profile twice! The parent might have mentioned
+                    // two users with the same nickname on different instances.
+                    unset($origMentions[$origMentionsId]);
+                    break;
+                }
+            }
  
-            if ($parent instanceof Notice && $parent->getProfile()->getNickname() === $nickname) {
+            // Try to get a profile for this nickname.
+            // Start with the parent's mentions, then the parent's author, then the sender's context.
+            if ($mention_found_in_origMentions) {
+                $mentioned = $mention_found_in_origMentions;            
+            } else if ($parent instanceof Notice && $parent->getProfile()->getNickname() === $nickname) {
                  $mentioned = $parent->getProfile();
-            } else if (!empty($origMentions) &&
-                       array_key_exists($nickname, $origMentions)) {
-                $mentioned = $origMentions[$nickname];
              } else {
                  // sets to null if no match
                  $mentioned = common_relative_profile($sender, $nickname);
              }
  
              if ($mentioned instanceof Profile) {
-                $user = User::getKV('id', $mentioned->id);
-
                  try {
-                    $url = $mentioned->getUrl();
+                    $url = $mentioned->getUri();    // prefer the URI as URL, if it is one.
+                    if (!common_valid_http_url($url)) {
+                        $url = $mentioned->getUrl();
+                    }
                  } catch (InvalidUrlException $e) {
                      $url = common_local_url('userbyid', array('id' => $mentioned->getID()));
                  }
@@ -766,7 +829,7 @@ function common_find_mentions($text, Profile $sender, Notice $parent=null)
  
          // @#tag => mention of all subscriptions tagged 'tag'
  
-        preg_match_all('/(?:^|[\s\.\,\:\;]+)@#([\pL\pN_\-\.]{1,64})/',
+        preg_match_all('/'.Nickname::BEFORE_MENTIONS.'@#([\pL\pN_\-\.]{1,64})/',
                         $text, $hmatches, PREG_OFFSET_CAPTURE);
          foreach ($hmatches[1] as $hmatch) {
              $tag = common_canonical_tag($hmatch[0]);
@@ -788,9 +851,8 @@ function common_find_mentions($text, Profile $sender, Notice $parent=null)
                                  'url' => $url);
          }
  
-        preg_match_all('/(?:^|[\s\.\,\:\;]+)!(' . Nickname::DISPLAY_FMT . ')/',
-                       $text, $hmatches, PREG_OFFSET_CAPTURE);
-        foreach ($hmatches[1] as $hmatch) {
+        $hmatches = common_find_mentions_raw($text, '!');
+        foreach ($hmatches as $hmatch) {
              $nickname = Nickname::normalize($hmatch[0]);
              $group = User_group::getForNickname($nickname, $sender);
  
@@ -820,9 +882,10 @@ function common_find_mentions($text, Profile $sender, Notice $parent=null)
   * Should generally not be called directly; for use in common_find_mentions.
   *
   * @param string $text
+ * @param string $preMention Character(s) that signals a mention ('@', '!'...)
   * @return array of PCRE match arrays
   */
-function common_find_mentions_raw($text)
+function common_find_mentions_raw($text, $preMention='@')
  {
      $tmatches = array();
      preg_match_all('/^T (' . Nickname::DISPLAY_FMT . ') /',
@@ -831,7 +894,8 @@ function common_find_mentions_raw($text)
                     PREG_OFFSET_CAPTURE);
  
      $atmatches = array();
-    preg_match_all('/(?:^|\s+)@(' . Nickname::DISPLAY_FMT . ')\b/',
+    // the regexp's "(?!\@)" makes sure it doesn't match the single "@remote" in "@remote@server.com"
+    preg_match_all('/'.Nickname::BEFORE_MENTIONS.preg_quote($preMention, '/').'(' . Nickname::DISPLAY_FMT . ')\b(?!\@)/',
                     $text,
                     $atmatches,
                     PREG_OFFSET_CAPTURE);
@@ -853,6 +917,50 @@ function common_render_text($text)
      return $text;
  }
  
+define('_URL_SCHEME_COLON_DOUBLE_SLASH', 1);
+define('_URL_SCHEME_SINGLE_COLON', 2);
+define('_URL_SCHEME_NO_DOMAIN', 4);
+define('_URL_SCHEME_COLON_COORDINATES', 8);
+
+function common_url_schemes($filter=null)
+{
+    // TODO: move these to $config
+    $schemes = [
+                'http'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'https'     => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'ftp'       => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'ftps'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'mms'       => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'rtsp'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'gopher'    => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'news'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'nntp'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'telnet'    => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'wais'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'file'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'prospero'  => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'webcal'    => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'irc'       => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'ircs'      => _URL_SCHEME_COLON_DOUBLE_SLASH,
+                'aim'       => _URL_SCHEME_SINGLE_COLON,
+                'bitcoin'   => _URL_SCHEME_SINGLE_COLON,
+                'fax'       => _URL_SCHEME_SINGLE_COLON,
+                'jabber'    => _URL_SCHEME_SINGLE_COLON,
+                'mailto'    => _URL_SCHEME_SINGLE_COLON,
+                'tel'       => _URL_SCHEME_SINGLE_COLON,
+                'xmpp'      => _URL_SCHEME_SINGLE_COLON,
+                'magnet'    => _URL_SCHEME_NO_DOMAIN,
+                'geo'       => _URL_SCHEME_COLON_COORDINATES,
+                ];
+
+    return array_keys(
+            array_filter($schemes,
+                function ($scheme) use ($filter) {
+                    return is_null($filter) || ($scheme & $filter);
+                })
+            );
+}
+
  /**
   * Find links in the given text and pass them to the given callback function.
   *
@@ -861,6 +969,13 @@ function common_render_text($text)
   * @param mixed $arg: optional argument will be passed on to the callback
   */
  function common_replace_urls_callback($text, $callback, $arg = null) {
+    $geouri_labeltext_regex = '\pN\pL\-';
+    $geouri_mark_regex = '\-\_\.\!\~\*\\\'\(\)';    // the \\\' is really pretty
+    $geouri_unreserved_regex = '\pN\pL' . $geouri_mark_regex;
+    $geouri_punreserved_regex = '\[\]\:\&\+\$';
+    $geouri_pctencoded_regex = '(?:\%[0-9a-fA-F][0-9a-fA-F])';
+    $geouri_paramchar_regex = $geouri_unreserved_regex . $geouri_punreserved_regex; //FIXME: add $geouri_pctencoded_regex here so it works
+
      // Start off with a regex
      $regex = '#'.
      '(?:^|[\s\<\>\(\)\[\]\{\}\\\'\\\";]+)(?![\@\!\#])'.
@@ -868,9 +983,9 @@ function common_replace_urls_callback($text, $callback, $arg = null) {
          '(?:'.
              '(?:'. //Known protocols
                  '(?:'.
-                    '(?:(?:https?|ftps?|mms|rtsp|gopher|news|nntp|telnet|wais|file|prospero|webcal|ircs?)://)'.
+                    '(?:(?:' . implode('|', common_url_schemes(_URL_SCHEME_COLON_DOUBLE_SLASH)) . ')://)'.
                      '|'.
-                    '(?:(?:mailto|aim|tel|xmpp):)'.
+                    '(?:(?:' . implode('|', common_url_schemes(_URL_SCHEME_SINGLE_COLON)) . '):)'.
                  ')'.
                  '(?:[\pN\pL\-\_\+\%\~]+(?::[\pN\pL\-\_\+\%\~]+)?\@)?'. //user:pass@
                  '(?:'.
@@ -881,11 +996,26 @@ function common_replace_urls_callback($text, $callback, $arg = null) {
                      ')'.
                  ')'.
              ')'.
-            '|(?:(?:magnet):)'. // URLs without domain name
-            '|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'. //IPv4
-            '|(?:'. //IPv6
-                '\[?(?:(?:(?:[0-9A-Fa-f]{1,4}:){7}(?:(?:[0-9A-Fa-f]{1,4})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){6}(?::|(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})|(?::[0-9A-Fa-f]{1,4})))|(?:(?:[0-9A-Fa-f]{1,4}:){5}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){4}(?::[0-9A-Fa-f]{1,4}){0,1}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){3}(?::[0-9A-Fa-f]{1,4}){0,2}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){2}(?::[0-9A-Fa-f]{1,4}){0,3}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:)(?::[0-9A-Fa-f]{1,4}){0,4}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?::(?::[0-9A-Fa-f]{1,4}){0,5}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})))\]?(?<!:)'.
+            '|(?:'.
+                '(?:' . implode('|', common_url_schemes(_URL_SCHEME_COLON_COORDINATES)) . '):'.
+                // There's an order that must be followed here too, if ;crs= is used, it must precede ;u=
+                // Also 'crsp' (;crs=$crsp) must match $geouri_labeltext_regex
+                // Also 'uval' (;u=$uval) must be a pnum: \-?[0-9]+
+                '(?:'.
+                    '(?:[0-9]+(?:\.[0-9]+)?(?:\,[0-9]+(?:\.[0-9]+)?){1,2})'.    // 1(.23)?(,4(.56)){1,2}
+                    '(?:\;(?:['.$geouri_labeltext_regex.']+)(?:\=['.$geouri_paramchar_regex.']+)*)*'.
+                ')'.
              ')'.
+            // URLs without domain name, like magnet:?xt=...
+            '|(?:(?:' . implode('|', common_url_schemes(_URL_SCHEME_NO_DOMAIN)) . '):(?=\?))'.  // zero-length lookahead requires ? after :
+            (common_config('linkify', 'bare_ipv4')   // Convert IPv4 addresses to hyperlinks
+                ? '|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
+                : '').
+            (common_config('linkify', 'bare_ipv6')   // Convert IPv6 addresses to hyperlinks
+                ? '|(?:'. //IPv6
+                    '\[?(?:(?:(?:[0-9A-Fa-f]{1,4}:){7}(?:(?:[0-9A-Fa-f]{1,4})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){6}(?::|(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})|(?::[0-9A-Fa-f]{1,4})))|(?:(?:[0-9A-Fa-f]{1,4}:){5}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){4}(?::[0-9A-Fa-f]{1,4}){0,1}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){3}(?::[0-9A-Fa-f]{1,4}){0,2}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:){2}(?::[0-9A-Fa-f]{1,4}){0,3}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:[0-9A-Fa-f]{1,4}:)(?::[0-9A-Fa-f]{1,4}){0,4}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?::(?::[0-9A-Fa-f]{1,4}){0,5}(?:(?::(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})?)|(?:(?::[0-9A-Fa-f]{1,4}){1,2})))|(?:(?:(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})(?:\.(?:25[0-5]|2[0-4]\d|[01]?\d{1,2})){3})))\]?(?<!:)'.
+                    ')'
+                : '').
              (common_config('linkify', 'bare_domains')
                  ? '|(?:'. //DNS
                      '(?:[\pN\pL\-\_\+\%\~]+(?:\:[\pN\pL\-\_\+\%\~]+)?\@)?'. //user:pass@
@@ -897,10 +1027,10 @@ function common_replace_urls_callback($text, $callback, $arg = null) {
          ')'.
          '(?:'.
              '(?:\:\d+)?'. //:port
-            '(?:/[\pN\pL$\,\!\(\)\.\:\-\_\+\/\=\&\;\%\~\*\$\+\'@]*)?'. // /path
-            '(?:\?[\pN\pL\$\,\!\(\)\.\:\-\_\+\/\=\&\;\%\~\*\$\+\'@\/]*)?'. // ?query string
-            '(?:\#[\pN\pL$\,\!\(\)\.\:\-\_\+\/\=\&\;\%\~\*\$\+\'\@/\?\#]*)?'. // #fragment
-        ')(?<![\?\.\,\#\,])'.
+            '(?:/['  . URL_REGEX_VALID_PATH_CHARS    . ']*)?'.  // path
+            '(?:\?[' . URL_REGEX_VALID_QSTRING_CHARS . ']*)?'.  // ?query string
+            '(?:\#[' . URL_REGEX_VALID_FRAGMENT_CHARS . ']*)?'. // #fragment
+        ')(?<!['. URL_REGEX_EXCLUDED_END_CHARS .'])'.
      ')'.
      '#ixu';
      //preg_match_all($regex,$text,$matches);
@@ -997,9 +1127,9 @@ function common_linkify($url) {
  
      // Check to see whether this is a known "attachment" URL.
  
-    $f = File::getKV('url', $longurl);
-
-    if (!$f instanceof File) {
+    try {
+        $f = File::getByUrl($longurl);
+    } catch (NoResultException $e) {
          if (common_config('attachments', 'process_links')) {
              // XXX: this writes to the database. :<
              try {
@@ -1023,17 +1153,7 @@ function common_linkify($url) {
          }
      }
  
-    // Add clippy
-    if ($is_attachment) {
-        $attrs['class'] = 'attachment';
-        if ($has_thumb) {
-            $attrs['class'] = 'attachment thumbnail';
-        }
-        $attrs['id'] = "attachment-{$attachment_id}";
-    }
-
      // Whether to nofollow
-
      $nf = common_config('nofollow', 'external');
  
      if ($nf == 'never') {
@@ -1042,6 +1162,16 @@ function common_linkify($url) {
          $attrs['rel'] = 'nofollow external';
      }
  
+    // Add clippy
+    if ($is_attachment) {
+        $attrs['class'] = 'attachment';
+        if ($has_thumb) {
+            $attrs['class'] = 'attachment thumbnail';
+        }
+        $attrs['id'] = "attachment-{$attachment_id}";
+        $attrs['rel'] .= ' noreferrer';
+    }
+
      return XMLStringer::estring('a', $attrs, $url);
  }
  
@@ -1249,9 +1379,7 @@ function common_local_url($action, $args=null, $params=null, $fragment=null, $ad
          $r = Router::get();
          $path = $r->build($action, $args, $params, $fragment);
  
-        $ssl = common_config('site', 'ssl') === 'always'
-                || GNUsocial::isHTTPS()
-                || common_is_sensitive($action);
+        $ssl = GNUsocial::useHTTPS();
  
          if (common_config('site','fancy')) {
              $url = common_path($path, $ssl, $addSession);
@@ -1267,35 +1395,11 @@ function common_local_url($action, $args=null, $params=null, $fragment=null, $ad
      return $url;
  }
  
-function common_is_sensitive($action)
-{
-    static $sensitive = array(
-        'login',
-        'register',
-        'passwordsettings',
-        'api',
-        'ApiOAuthRequestToken',
-        'ApiOAuthAccessToken',
-        'ApiOAuthAuthorize',
-        'ApiOAuthPin',
-        'showapplication'
-    );
-    $ssl = null;
-
-    if (Event::handle('SensitiveAction', array($action, &$ssl))) {
-        $ssl = in_array($action, $sensitive);
-    }
-
-    return $ssl;
-}
-
  function common_path($relative, $ssl=false, $addSession=true)
  {
      $pathpart = (common_config('site', 'path')) ? common_config('site', 'path')."/" : '';
  
-    if (($ssl && (common_config('site', 'ssl') === 'sometimes'))
-        || GNUsocial::isHTTPS()
-        || common_config('site', 'ssl') === 'always') {
+    if ($ssl && GNUsocial::useHTTPS()) {
          $proto = 'https';
          if (is_string(common_config('site', 'sslserver')) &&
              mb_strlen(common_config('site', 'sslserver')) > 0) {
@@ -1321,6 +1425,74 @@ function common_path($relative, $ssl=false, $addSession=true)
      return $proto.'://'.$serverpart.'/'.$pathpart.$relative;
  }
  
+// FIXME: Maybe this should also be able to handle non-fancy URLs with index.php?p=...
+function common_fake_local_fancy_url($url)
+{
+    /**
+     * This is a hacky fix to make URIs generated with "index.php/" match against
+     * locally stored URIs without that. So for example if the remote site is looking
+     * up the webfinger for some user and for some reason knows about https://some.example/user/1
+     * but we locally store and report only https://some.example/index.php/user/1 then they would
+     * dismiss the profile for not having an identified alias.
+     *
+     * There are various live instances where these issues occur, for various reasons.
+     * Most of them being users fiddling with configuration while already having
+     * started federating (distributing the URI to other servers) or maybe manually
+     * editing the local database.
+     */
+    if (!preg_match(
+                // [1] protocol part, we can only rewrite http/https anyway.
+                '/^(https?:\/\/)' .
+                // [2] site name.
+                // FIXME: Dunno how this acts if we're aliasing ourselves with a .onion domain etc.
+                '('.preg_quote(common_config('site', 'server'), '/').')' .
+                // [3] site path, or if that is empty just '/' (to retain the /)
+                '('.preg_quote(common_config('site', 'path') ?: '/', '/').')' .
+                // [4] + [5] extract index.php (+ possible leading double /) and the rest of the URL separately.
+                '(\/?index\.php\/)(.*)$/', $url, $matches)) {
+        // if preg_match failed to match
+        throw new Exception('No known change could be made to the URL.');
+    }
+
+    // now reconstruct the URL with everything except the "index.php/" part
+    $fancy_url = '';
+    foreach ([1,2,3,5] as $idx) {
+        $fancy_url .= $matches[$idx];
+    }
+    return $fancy_url;
+}
+
+// FIXME: Maybe this should also be able to handle non-fancy URLs with index.php?p=...
+function common_fake_local_nonfancy_url($url)
+{
+    /**
+     * This is a hacky fix to make URIs NOT generated with "index.php/" match against
+     * locally stored URIs WITH that. The reverse of common_fake_local_fancy_url().
+     *
+     * It will also "repair" index.php URLs with multiple / prepended. Like https://some.example///index.php/user/1
+     */
+    if (!preg_match(
+                // [1] protocol part, we can only rewrite http/https anyway.
+                '/^(https?:\/\/)' .
+                // [2] site name.
+                // FIXME: Dunno how this acts if we're aliasing ourselves with a .onion domain etc.
+                '('.preg_quote(common_config('site', 'server'), '/').')' .
+                // [3] site path, or if that is empty just '/' (to retain the /)
+                '('.preg_quote(common_config('site', 'path') ?: '/', '/').')' .
+                // [4] should be empty (might contain one or more / and then maybe also index.php). Will be overwritten.
+                // [5] will have the extracted actual URL part (besides site path)
+                '((?!index.php\/)\/*(?:index.php\/)?)(.*)$/', $url, $matches)) {
+        // if preg_match failed to match
+        throw new Exception('No known change could be made to the URL.');
+    }
+
+    $matches[4] = 'index.php/'; // inject the index.php/ rewritethingy
+
+    // remove the first element, which is the full matching string
+    array_shift($matches);
+    return implode($matches);
+}
+
  function common_inject_session($url, $serverpart = null)
  {
      if (!common_have_session()) {
@@ -1531,10 +1703,15 @@ function common_profile_url($nickname)
  
  /**
   * Should make up a reasonable root URL
+ *
+ * @param   bool    $tls    true or false to force TLS scheme, null to use server configuration
   */
-function common_root_url($ssl=false)
+function common_root_url($tls=null)
  {
-    $url = common_path('', $ssl, false);
+    if (is_null($tls)) {
+        $tls = GNUsocial::useHTTPS();
+    }
+    $url = common_path('', $tls, false);
      $i = strpos($url, '?');
      if ($i !== false) {
          $url = substr($url, 0, $i);
@@ -1733,6 +1910,10 @@ function common_log_objstring(&$object)
  
  function common_valid_http_url($url, $secure=false)
  {
+    if (empty($url)) {
+        return false;
+    }
+
      // If $secure is true, only allow https URLs to pass
      // (if false, we use '?' in 'https?' to say the 's' is optional)
      $regex = $secure ? '/^https$/' : '/^https?$/';
@@ -1827,14 +2008,24 @@ function common_accept_to_prefs($accept, $def = '*/*')
  }
  
  // Match by our supported file extensions
-function common_supported_ext_to_mime($fileext)
+function common_supported_filename_to_mime($filename)
  {
      // Accept a filename and take out the extension
-    if (strpos($fileext, '.') !== false) {
-        $fileext = substr(strrchr($fileext, '.'), 1);
+    if (strpos($filename, '.') === false) {
+        throw new ServerException(sprintf('No extension on filename: %1$s', _ve($filename)));
      }
  
+    $fileext = substr(strrchr($filename, '.'), 1);
+    return common_supported_ext_to_mime($fileext);
+}
+
+function common_supported_ext_to_mime($fileext)
+{
      $supported = common_config('attachments', 'supported');
+    if ($supported === true) {
+        // FIXME: Should we just accept the extension straight off when supported === true?
+        throw new UnknownExtensionMimeException($fileext);
+    }
      foreach($supported as $type => $ext) {
          if ($ext === $fileext) {
              return $type;
@@ -1848,13 +2039,15 @@ function common_supported_ext_to_mime($fileext)
  function common_supported_mime_to_ext($mimetype)
  {
      $supported = common_config('attachments', 'supported');
-    foreach($supported as $type => $ext) {
-        if ($mimetype === $type) {
-            return $ext;
+    if (is_array($supported)) {
+        foreach($supported as $type => $ext) {
+            if ($mimetype === $type) {
+                return $ext;
+            }
          }
      }
  
-    throw new ServerException('Unsupported MIME type');
+    throw new UnknownMimeExtensionException($mimetype);
  }
  
  // The MIME "media" is the part before the slash (video in video/webm)
@@ -1871,7 +2064,7 @@ function common_bare_mime($mimetype)
      if ($semicolon = mb_strpos($mimetype, ';')) {
          $mimetype = mb_substr($mimetype, 0, $semicolon);
      }
-    return $mimetype;
+    return trim($mimetype);
  }
  
  function common_mime_type_match($type, $avail)
@@ -2422,6 +2615,9 @@ function common_log_delta($comment=null)
  
  function common_strip_html($html, $trim=true, $save_whitespace=false)
  {
+    // first replace <br /> with \n
+    $html = preg_replace('/\<(\s*)?br(\s*)?\/?(\s*)?\>/i', "\n", $html); 
+    // then, unless explicitly avoided, remove excessive whitespace
      if (!$save_whitespace) {
          $html = preg_replace('/\s+/', ' ', $html);
      }
@@ -2436,4 +2632,9 @@ function html_sprintf()
          $args[$i] = htmlspecialchars($args[$i]);
      }
      return call_user_func_array('sprintf', $args);
-}
-\ No newline at end of file
+}
+
+function _ve($var)
+{
+    return var_export($var, true);
+}