Allow using punctuation chars as smiley delimiters

author gudzpoz <gudzpoz@live.com>

Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)

committer gudzpoz <gudzpoz@live.com>

Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)
author gudzpoz <gudzpoz@live.com>
Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)
committer gudzpoz <gudzpoz@live.com>
Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)
diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php

index 1aeab5b804fb5bdfe62fbfdc815781064735e3d0..9c357a9eb267d47e1927703e16f83751f596c2e5 100644 (file)
--- a/src/Content/Smilies.php
+++ b/src/Content/Smilies.php
@@ -197,11 +197,11 @@ class Smilies
          * @return array with smilie codes (colon included) as the keys, their image urls as values;
          *               the normalized string is put under the '' (empty string) key
          */
-       public static function extractUsedSmilies(string $text): array
+       public static function extractUsedSmilies(string $text, string &$normalized = null): array
         {
                 $emojis = [];
  
-               $emojis[''] = BBCode::performWithEscapedTags($text, ['code'], function ($text) use (&$emojis) {
+               $normalized = BBCode::performWithEscapedTags($text, ['code'], function ($text) use (&$emojis) {
                         return BBCode::performWithEscapedTags($text, ['noparse', 'nobb', 'pre'], function ($text) use (&$emojis) {
                                 if (strpos($text, '[nosmile]') !== false || self::noSmilies()) {
                                         return $text;
@@ -236,43 +236,69 @@ class Smilies
          */
         private static function performForEachWordMatch(array $words, string $subject, callable $callback): string
         {
-               $offset = 0;
-               $result = '';
-               $processed = 0;
-               // Learned from PHP's strtr implementation
-               // Should probably improve performance once JIT-compiled
-               $length_bitset = 0;
-               $ord_bitset = 0;
+               $ord1_bitset = 0;
+               $ord2_bitset = 0;
+               $prefixes = [];
                 foreach ($words as $word => $_) {
-                       $length = strlen($word);
-                       if ($length <= 31) {
-                               $length_bitset |= 1 << $length;
+                       if (strlen($word) < 2 || !ctype_graph($word)) {
+                               continue;
+                       }
+                       $ord1 = ord($word);
+                       $ord2 = ord($word[1]);
+                       $ord1_bitset |= 1 << ($ord1 & 31);
+                       $ord2_bitset |= 1 << ($ord2 & 31);
+                       if (!array_key_exists($word[0], $prefixes)) {
+                               $prefixes[$word[0]] = [];
                         }
-                       $ord = ord($word);
-                       $ord_bitset |= 1 << ($ord & 31);
+                       $prefixes[$word[0]][] = $word;
                 }
  
-               while ($offset < strlen($subject) && preg_match('/\s+?(?=\S|$)/', $subject, $matches, PREG_OFFSET_CAPTURE, $offset)) {
-                       [$whitespaces, $next] = $matches[0];
-                       $word = substr($subject, $offset, $next - $offset);
-
-                       $shift = strlen($word);
-                       $ord = ord($word);
-                       if (($shift > 31 || ($length_bitset & (1 << $shift)))
-                               && ($ord_bitset & (1 << ($ord & 31)))
-                               && array_key_exists($word, $words)) {
-                               $result .= substr($subject, $processed, $offset - $processed);
-                               $result .= call_user_func($callback, $word, $words[$word]);
-                               $processed = $offset + strlen($word);
+               $result = '';
+               $processed = 0;
+               $s_start = 0; // Segment start
+               // No spaces are allowed in smilies, so they can serve as delimiters.
+               // Splitting by some delimiters may not be necessary though?
+               while (true) {
+                       if ($s_start >= strlen($subject)) {
+                               $result .= substr($subject, $processed);
+                               break;
                         }
-                       $offset = $next + strlen($whitespaces);
-               }
-               $word = substr($subject, $offset);
-               if (array_key_exists($word, $words)) {
-                       $result .= substr($subject, $processed, $offset - $processed);
-                       $result .= call_user_func($callback, $word, $words[$word]);
-               } else {
-                       $result .= substr($subject, $processed);
+                       if (preg_match('/\s+?(?=\S|$)/', $subject, $match, PREG_OFFSET_CAPTURE, $s_start)) {
+                               [$whitespaces, $s_end] = $match[0];
+                       } else {
+                               $s_end = strlen($subject);
+                               $whitespaces = '';
+                       }
+                       $s_length = $s_end - $s_start;
+                       if ($s_length > 1) {
+                               $segment = substr($subject, $s_start, $s_length);
+                               // Find possible starting points for smilies.
+                               // For built-in smilies, the two bitsets should make attempts quite efficient.
+                               // However, presuming custom smilies follow the format of ":shortcode" or ":shortcode:",
+                               // if the user adds more smilies (with addons), the second bitset may eventually become useless.
+                               for ($i = 0; $i < $s_length - 1; $i++) {
+                                       $c = $segment[$i];
+                                       $d = $segment[$i + 1];
+                                       if (($ord1_bitset & (1 << (ord($c) & 31))) && ($ord2_bitset & (1 << (ord($d) & 31))) && array_key_exists($c, $prefixes)) {
+                                               foreach ($prefixes[$c] as $word) {
+                                                       $wlength = strlen($word);
+                                                       if ($wlength <= $s_length - $i && substr($segment, $i, $wlength) === $word) {
+                                                               // Check for boundaries
+                                                               if (($i === 0 || ctype_space($segment[$i - 1]) || ctype_punct($segment[$i - 1]))
+                                                                       && ($i + $wlength >= $s_length || ctype_space($segment[$i + $wlength]) || ctype_punct($segment[$i + $wlength]))) {
+                                                                       $result .= substr($subject, $processed, $s_start - $processed + $i);
+                                                                       $result .= call_user_func($callback, $word, $words[$word]);
+                                                                       $i += $wlength;
+                                                                       $processed = $s_start + $i;
+                                                                       $i--;
+                                                                       break;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       $s_start = $s_end + strlen($whitespaces);
                 }
                 return $result;
         }
diff --git a/src/Factory/Api/Mastodon/Status.php b/src/Factory/Api/Mastodon/Status.php

index 6d45b4d9fe683e0099c76cd8cf86bdf9eeb3b321..5cd90ecb5f94d2dc2024e5f2bed982b7b68fe4a5 100644 (file)
--- a/src/Factory/Api/Mastodon/Status.php
+++ b/src/Factory/Api/Mastodon/Status.php
@@ -290,14 +290,12 @@ class Status extends BaseFactory
  
                 $emojis = null;
                 if (DI::baseUrl()->isLocalUrl($item['uri'])) {
-                       $used_smilies = Smilies::extractUsedSmilies($item['raw-body'] ?: $item['body']);
-                       // $used_smilies contains normalized texts
+                       $used_smilies = Smilies::extractUsedSmilies($item['raw-body'] ?: $item['body'], $normalized);
                         if ($item['raw-body']) {
-                               $item['raw-body'] = $used_smilies[''];
+                               $item['raw-body'] = $normalized;
                         } elseif ($item['body']) {
-                               $item['body'] = $used_smilies[''];
+                               $item['body'] = $normalized;
                         }
-                       unset($used_smilies['']);
                         $emojis = $this->mstdnEmojiFactory->createCollectionFromArray($used_smilies)->getArrayCopy(true);
                 } else {
                         if (preg_match_all("(\[emoji=(.*?)](.*?)\[/emoji])ism", $item['body'] ?: $item['raw-body'], $matches)) {
diff --git a/src/Protocol/ActivityPub/Transmitter.php b/src/Protocol/ActivityPub/Transmitter.php

index 56724e22d2cb95502d37efece2fff5e25c03e322..9d0d998f9d6dd56df96573126210823fc0796db7 100644 (file)
--- a/src/Protocol/ActivityPub/Transmitter.php
+++ b/src/Protocol/ActivityPub/Transmitter.php
@@ -1514,11 +1514,9 @@ class Transmitter
          * @param string $text Text containing tags like :tag:
          * @return string normalized text
          */
-       private static function addEmojiTags(array &$tags, string $text)
+       private static function addEmojiTags(array &$tags, string $text): string
         {
-               $emojis = Smilies::extractUsedSmilies($text);
-               $normalized = $emojis[''];
-               unset($emojis['']);
+               $emojis = Smilies::extractUsedSmilies($text, $normalized);
                 foreach ($emojis as $name => $url) {
                         $tags[] = [
                                 'type' => 'Emoji',
diff --git a/tests/datasets/api.fixture.php b/tests/datasets/api.fixture.php

index 876827d7486c499ec956a4fe3aebdf74e46ed641..2bf38a5e91fcfc1cde09bd7b11aacaf1a336ef70 100644 (file)
--- a/tests/datasets/api.fixture.php
+++ b/tests/datasets/api.fixture.php
@@ -371,7 +371,7 @@ return [
                 [
                         'uri-id' => 100,
                         'title'  => 'item_title',
-                       'body'   => ':like ~friendica no [code]:dislike[/code] :-p :-[',
+                       'body'   => ':like ~friendica no [code]:dislike[/code] :-p :-[ <3',
                         'plink'  => 'https://friendica.local/post/100',
                 ],
         ],
diff --git a/tests/src/Content/SmiliesTest.php b/tests/src/Content/SmiliesTest.php

index 67ba313fe6f4277548c0ecc3e9c09d1c074f7a03..e41e59ca82520e38f64b93d333ae09556a6748e4 100644 (file)
--- a/tests/src/Content/SmiliesTest.php
+++ b/tests/src/Content/SmiliesTest.php
@@ -147,7 +147,7 @@ class SmiliesTest extends FixtureTest
  
         public function dataReplace(): array
         {
-               return [
+               $data = [
                         'simple-1' => [
                                 'expected' => 'alt=":-p"',
                                 'body' => ':-p',
@@ -165,7 +165,7 @@ class SmiliesTest extends FixtureTest
                                 'body' => '~friendicaca',
                         ],
                         'symbol-boundary-1' => [
-                               'expected' => '(:-p)',
+                               'expected' => 'alt=":-p"',
                                 'body' => '(:-p)',
                         ],
                         'hearts-1' => [
@@ -185,6 +185,19 @@ class SmiliesTest extends FixtureTest
                                 'body' => '(3&lt;33)',
                         ],
                 ];
+               foreach ([':-[', ':-D', 'o.O'] as $emoji) {
+                       foreach (['A', '_', ':', '-'] as $prefix) {
+                               foreach (['', ' ', 'A', ':', '-'] as $suffix) {
+                                       $no_smile = ($prefix !== '' && ctype_alnum($prefix)) || ($suffix !== '' && ctype_alnum($suffix));
+                                       $s = $prefix . $emoji . $suffix;
+                                       $data[] = [
+                                               'expected' => $no_smile ? $s : 'alt="' . $emoji . '"',
+                                               'body' => $s,
+                                       ];
+                               }
+                       }
+               }
+               return $data;
         }
  
         /**
@@ -202,6 +215,11 @@ class SmiliesTest extends FixtureTest
         public function dataExtractUsedSmilies(): array
         {
                 return [
+                       'symbols' => [
+                               'expected' => ['p', 'heart', 'embarrassed', 'kiss'],
+                               'body' => ':-p &lt;3 ":-[:-"',
+                               'normalized' => ':p: :heart: ":embarrassed::kiss:',
+                       ],
                         'single-smiley' => [
                                 'expected' => ['like'],
                                 'body' => ':like',
@@ -239,11 +257,12 @@ class SmiliesTest extends FixtureTest
          */
         public function testExtractUsedSmilies(array $expected, string $body, string $normalized)
         {
-               $extracted = Smilies::extractUsedSmilies($body);
-               $this->assertEquals($normalized, $extracted['']);
-               foreach ($expected as $shortcode) {
-                       $this->assertArrayHasKey($shortcode, $extracted);
+               $extracted = Smilies::extractUsedSmilies($body, $converted);
+               $expected = array_fill_keys($expected, true);
+               $this->assertEquals($normalized, $converted);
+               foreach (array_keys($extracted) as $shortcode) {
+                       $this->assertArrayHasKey($shortcode, $expected);
                 }
-               $this->assertEquals(count($expected), count($extracted) - 1);
+               $this->assertEquals(count($expected), count($extracted));
         }
  }
diff --git a/tests/src/Factory/Api/Mastodon/StatusTest.php b/tests/src/Factory/Api/Mastodon/StatusTest.php

index 7593d9a32b08cc53875c044ea96d46c2fbe6ca5b..d150d85574b6c2a7fdbbf75d06ab57ac161a97c6 100644 (file)
--- a/tests/src/Factory/Api/Mastodon/StatusTest.php
+++ b/tests/src/Factory/Api/Mastodon/StatusTest.php
@@ -50,7 +50,7 @@ class StatusTest extends FixtureTest
                 $post = Post::selectFirst([], ['id' => 14]);
                 $this->assertNotNull($post);
                 $result = $this->status->createFromUriId($post['uri-id'])->toArray();
-               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed:', $result['content']);
+               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: ❤', $result['content']);
                 $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed'], true);
                 $this->assertEquals(count($emojis), count($result['emojis']));
                 foreach ($result['emojis'] as $emoji) {
diff --git a/tests/src/Protocol/ActivityPub/TransmitterTest.php b/tests/src/Protocol/ActivityPub/TransmitterTest.php

index c7a94bc59835b01722fe84e90e550f237e7f48a6..49b51da4b9ad93683faa518a2c2f984c86629996 100644 (file)
--- a/tests/src/Protocol/ActivityPub/TransmitterTest.php
+++ b/tests/src/Protocol/ActivityPub/TransmitterTest.php
@@ -42,7 +42,7 @@ class TransmitterTest extends FixtureTest
                 $note = Transmitter::createNote($post);
                 $this->assertNotNull($note);
  
-               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed:', $note['content']);
+               $this->assertEquals(':like: :friendica: no <code>:dislike</code> :p: :embarrassed: ❤', $note['content']);
                 $emojis = array_fill_keys(['like', 'friendica', 'p', 'embarrassed'], true);
                 $this->assertEquals(count($emojis), count($note['tag']));
                 foreach ($note['tag'] as $emoji) {
author	gudzpoz <gudzpoz@live.com>
	Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)
committer	gudzpoz <gudzpoz@live.com>
	Thu, 16 Nov 2023 05:31:31 +0000 (13:31 +0800)
src/Content/Smilies.php		patch \| blob \| history
src/Factory/Api/Mastodon/Status.php		patch \| blob \| history
src/Protocol/ActivityPub/Transmitter.php		patch \| blob \| history
tests/datasets/api.fixture.php		patch \| blob \| history
tests/src/Content/SmiliesTest.php		patch \| blob \| history
tests/src/Factory/Api/Mastodon/StatusTest.php		patch \| blob \| history
tests/src/Protocol/ActivityPub/TransmitterTest.php		patch \| blob \| history