return $params;
}
- /**
- * Normalizes smiley shortcodes into texts with no special symbols.
- *
- * @return array
- * 'texts' => smilie shortcut
- * 'icons' => icon url or an empty string
- * 'norms' => normalized shortcut
- */
- public static function getNormalizedList(): array
- {
- $smilies = self::getList();
- $norms = [];
- $icons = $smilies['icons'];
- foreach ($smilies['texts'] as $i => $shortcode) {
- // Extract urls
- $icon = $icons[$i];
- if (preg_match('/src="(.+?)"/', $icon, $match)) {
- $icon = $match[1];
- } else {
- $icon = '';
- }
- $icons[$i] = $icon;
-
- // Normalize name
- $norm = preg_replace('/[\s\-:#~]/', '', $shortcode);
- if (ctype_alnum($norm)) {
- $norms[] = $norm;
- } elseif (preg_match('#/smiley-(\w+)\.gif#', $icon, $match)) {
- $norms[] = $match[1];
- } else {
- $norms[] = 'smiley' . $i;
- }
- }
- $smilies['norms'] = $norms;
- return $smilies;
- }
-
/**
* Finds all used smilies (denoted by quoting colons like :heart:) in the provided text and normalizes their usages.
*
* @return array with smilie codes (colon included) as the keys, their image urls as values;
- * the normalized string is put under the '' (empty string) key
+ * the normalized text is written to the by-reference $normalized argument
*/
- public static function extractUsedSmilies(string $text): array
+ public static function extractUsedSmilies(string $text, string &$normalized = null): array
{
$emojis = [];
- $emojis[''] = BBCode::performWithEscapedTags($text, ['code'], function ($text) use (&$emojis) {
+ $normalized = BBCode::performWithEscapedTags($text, ['code'], function ($text) use (&$emojis) {
return BBCode::performWithEscapedTags($text, ['noparse', 'nobb', 'pre'], function ($text) use (&$emojis) {
if (strpos($text, '[nosmile]') !== false || self::noSmilies()) {
return $text;
}
- $smilies = self::getNormalizedList();
- $normalized = array_combine($smilies['texts'], $smilies['norms']);
+ $smilies = self::getList();
+ $normalized = [];
return self::performForEachWordMatch(
array_combine($smilies['texts'], $smilies['icons']),
$text,
function (string $name, string $image) use($normalized, &$emojis) {
- $name = $normalized[$name];
+ if (array_key_exists($name, $normalized)) {
+ return $normalized[$name];
+ }
if (preg_match('/src="(.+?)"/', $image, $match)) {
- $image = $match[1];
- $emojis[$name] = $image;
+ $url = $match[1];
+ // Image smiley: replaced with a normalized ":shortcode:" rather than embedded, for protocols like ActivityPub.
+ // Normalize the name by stripping whitespace and the characters - : # ~
+ $norm = preg_replace('/[\s\-:#~]/', '', $name);
+ if (!ctype_alnum($norm)) {
+ if (preg_match('#/smiley-(\w+)\.gif#', $url, $match)) {
+ $norm = $match[1];
+ } else {
+ $norm = 'smiley' . count($normalized);
+ }
+ }
+ $shortcode = ':' . $norm . ':';
+ $normalized[$name] = $shortcode;
+ $emojis[$norm] = $url;
+ return $shortcode;
+ } else {
+ $normalized[$name] = $image;
+ // Probably text-substitution smilies (e.g., Unicode ones).
+ return $image;
}
- return ':' . $name . ':';
},
);
});
*/
private static function performForEachWordMatch(array $words, string $subject, callable $callback): string
{
- $offset = 0;
- $result = '';
- $processed = 0;
- // Learned from PHP's strtr implementation
- // Should probably improve performance once JIT-compiled
- $length_bitset = 0;
- $ord_bitset = 0;
+ $ord1_bitset = 0;
+ $ord2_bitset = 0;
+ $prefixes = [];
foreach ($words as $word => $_) {
- $length = strlen($word);
- if ($length <= 31) {
- $length_bitset |= 1 << $length;
+ if (strlen($word) < 2) {
+ continue;
+ }
+ $ord1 = ord($word[0]);
+ $ord2 = ord($word[1]);
+ // A smiley shortcode must not begin or end with whitespace.
+ if (ctype_space($word[0]) || ctype_space($word[strlen($word) - 1])) {
+ continue;
+ }
+ $ord1_bitset |= 1 << ($ord1 & 31);
+ $ord2_bitset |= 1 << ($ord2 & 31);
+ if (!array_key_exists($word[0], $prefixes)) {
+ $prefixes[$word[0]] = [];
}
- $ord = ord($word);
- $ord_bitset |= 1 << ($ord & 31);
+ $prefixes[$word[0]][] = $word;
}
- while ($offset < strlen($subject) && preg_match('/\s+?(?=\S|$)/', $subject, $matches, PREG_OFFSET_CAPTURE, $offset)) {
- [$whitespaces, $next] = $matches[0];
- $word = substr($subject, $offset, $next - $offset);
-
- $shift = strlen($word);
- $ord = ord($word);
- if (($shift > 31 || ($length_bitset & (1 << $shift)))
- && ($ord_bitset & (1 << ($ord & 31)))
- && array_key_exists($word, $words)) {
- $result .= substr($subject, $processed, $offset - $processed);
- $result .= call_user_func($callback, $word, $words[$word]);
- $processed = $offset + strlen($word);
+ $slength = strlen($subject);
+ $result = '';
+ // $processed is the offset up to which $subject has already been copied into $result; unmatched text is appended in bulk since appending a char every loop is inefficient.
+ $processed = 0;
+ // Find possible starting points for smilies.
+ // For built-in smilies, the two bitsets (low 5 bits of the first and second byte of each shortcode) should reject most positions cheaply.
+ // However, presuming custom smilies follow the format of ":shortcode" or ":shortcode:",
+ // if the user adds more smilies (with addons), the second bitset may eventually fill up and stop filtering anything.
+ for ($i = 0; $i < $slength - 1; $i++) {
+ $c = $subject[$i];
+ $d = $subject[$i + 1];
+ if (($ord1_bitset & (1 << (ord($c) & 31))) && ($ord2_bitset & (1 << (ord($d) & 31))) && array_key_exists($c, $prefixes)) {
+ foreach ($prefixes[$c] as $word) {
+ $wlength = strlen($word);
+ if (substr($subject, $i, $wlength) === $word) {
+ // Check for boundaries
+ if (($i === 0 || ctype_space($subject[$i - 1]) || ctype_punct($subject[$i - 1]))
+ && ($i + $wlength >= $slength || ctype_space($subject[$i + $wlength]) || ctype_punct($subject[$i + $wlength]))) {
+ $result .= substr($subject, $processed, $i - $processed);
+ $result .= call_user_func($callback, $word, $words[$word]);
+ $i += $wlength;
+ $processed = $i;
+ $i--;
+ break;
+ }
+ }
+ }
}
- $offset = $next + strlen($whitespaces);
}
- $word = substr($subject, $offset);
- if (array_key_exists($word, $words)) {
- $result .= substr($subject, $processed, $offset - $processed);
- $result .= call_user_func($callback, $word, $words[$word]);
- } else {
+ if ($processed < $slength) {
$result .= substr($subject, $processed);
}
return $result;