From 43f9be367ff36129ad3c5550293252aa7e663e79 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 15 Nov 2023 16:19:05 +0000 Subject: [PATCH] Check for user defined channel matches before storing them --- database.sql | 12 +++- doc/database.md | 1 + doc/database/db_test-full-text-search.md | 23 ++++++ .../Repository/UserDefinedChannel.php | 48 ++++++++++++- src/Model/GServer.php | 4 +- src/Model/Post/Engagement.php | 70 ++++++++++++++++--- src/Model/Search.php | 35 +++++++++- src/Model/User.php | 11 ++- src/Module/Conversation/Timeline.php | 2 +- src/Protocol/ActivityPub/Processor.php | 51 +++++++++----- src/Protocol/Relay.php | 7 +- src/Worker/OptimizeTables.php | 1 + static/dbstructure.config.php | 13 +++- 13 files changed, 236 insertions(+), 42 deletions(-) create mode 100644 doc/database/db_test-full-text-search.md diff --git a/database.sql b/database.sql index 5c999da231..39a56e7966 100644 --- a/database.sql +++ b/database.sql @@ -1,6 +1,6 @@ -- ------------------------------------------ -- Friendica 2023.09-rc (Giant Rhubarb) --- DB_UPDATE_VERSION 1539 +-- DB_UPDATE_VERSION 1540 -- ------------------------------------------ @@ -1867,6 +1867,16 @@ CREATE TABLE IF NOT EXISTS `subscription` ( FOREIGN KEY (`uid`) REFERENCES `user` (`uid`) ON UPDATE RESTRICT ON DELETE CASCADE ) DEFAULT COLLATE utf8mb4_general_ci COMMENT='Push Subscription for the API'; +-- +-- TABLE test-full-text-search +-- +CREATE TABLE IF NOT EXISTS `test-full-text-search` ( + `pid` int unsigned NOT NULL DEFAULT 0 COMMENT 'Process id of the worker', + `searchtext` mediumtext COMMENT 'Simplified text for the full text search', + PRIMARY KEY(`pid`), + FULLTEXT INDEX `searchtext` (`searchtext`) +) DEFAULT COLLATE utf8mb4_general_ci COMMENT='Test for a full text search match in user defined channels before storing the message in the system'; + -- -- TABLE userd -- diff --git a/doc/database.md b/doc/database.md index 516be3bae1..0ed116bc73 100644 --- a/doc/database.md +++ b/doc/database.md @@ -86,6 +86,7 @@ Database Tables | [storage](help/database/db_storage) | Data stored by Database storage backend | | [subscription](help/database/db_subscription) | Push Subscription for the API | | [tag](help/database/db_tag) | tags and mentions | +| [test-full-text-search](help/database/db_test-full-text-search) | Test for a full text search match in user defined channels before storing the message in the system | | [user](help/database/db_user) | The local users | | [user-contact](help/database/db_user-contact) | User specific public contact data | | [user-gserver](help/database/db_user-gserver) | User settings about remote servers | diff --git a/doc/database/db_test-full-text-search.md b/doc/database/db_test-full-text-search.md new file mode 100644 index 0000000000..b6b9bd19e9 --- /dev/null +++ b/doc/database/db_test-full-text-search.md @@ -0,0 +1,23 @@ +Table test-full-text-search +=========== + +Test for a full text search match in user defined channels before storing the message in the system + +Fields +------ + +| Field | Description | Type | Null | Key | Default | Extra | +| ---------- | ---------------------------------------- | ------------ | ---- | --- | ------- | ----- | +| pid | Process id of the worker | int unsigned | NO | | 0 | | +| searchtext | Simplified text for the full text search | mediumtext | YES | | NULL | | + +Indexes +------------ + +| Name | Fields | +| ---------- | -------------------- | +| PRIMARY | pid | +| searchtext | FULLTEXT, searchtext | + + +Return to [database documentation](help/database) diff --git a/src/Content/Conversation/Repository/UserDefinedChannel.php b/src/Content/Conversation/Repository/UserDefinedChannel.php index 1014711765..a414f83c25 100644 --- a/src/Content/Conversation/Repository/UserDefinedChannel.php +++ b/src/Content/Conversation/Repository/UserDefinedChannel.php @@ -25,16 +25,23 @@ use Friendica\BaseCollection; use Friendica\Content\Conversation\Collection\UserDefinedChannels; use Friendica\Content\Conversation\Entity; use Friendica\Content\Conversation\Factory; +use Friendica\Core\PConfig\Capability\IManagePersonalConfigValues; use Friendica\Database\Database; +use Friendica\Model\User; use Psr\Log\LoggerInterface; class UserDefinedChannel extends \Friendica\BaseRepository { protected static $table_name = 'channel'; - public function __construct(Database $database, LoggerInterface $logger, Factory\UserDefinedChannel $factory) + /** @var IManagePersonalConfigValues */ + private $pConfig; + + public function __construct(Database $database, LoggerInterface $logger, Factory\UserDefinedChannel $factory, IManagePersonalConfigValues $pConfig) { parent::__construct($database, $logger, $factory); + + $this->pConfig = $pConfig; } /** @@ -89,7 +96,7 @@ class UserDefinedChannel extends \Friendica\BaseRepository */ public function deleteById(int $id, int $uid): bool { - return $this->db->delete('channel', ['id' => $id, 'uid' => $uid]); + return $this->db->delete(self::$table_name, ['id' => $id, 'uid' => $uid]); } /** @@ -130,4 +137,41 @@ class UserDefinedChannel extends \Friendica\BaseRepository return $Channel; } + + /** + * Checks, if one of the user defined channels matches with the given search text + * @todo To increase the performance, this functionality should be replaced with a single SQL call. + * + * @param string $searchtext + * @param string $language + * @return boolean + */ + public function match(string $searchtext, string $language): bool + { + if (!in_array($language, User::getLanguages())) { + $this->logger->debug('Unwanted language found. No matched channel found.', ['language' => $language, 'searchtext' => $searchtext]); + return false; + } + + $store = false; + $this->db->insert('test-full-text-search', ['pid' => getmypid(), 'searchtext' => $searchtext], Database::INSERT_UPDATE); + $channels = $this->db->select(self::$table_name, ['full-text-search', 'uid', 'label'], ["`full-text-search` != ?", '']); + while ($channel = $this->db->fetch($channels)) { + $channelsearchtext = $channel['full-text-search']; + foreach (['from', 'to', 'group', 'tag', 'network', 'platform', 'visibility'] as $keyword) { + $channelsearchtext = preg_replace('~(' . $keyword . ':.[\w@\.-]+)~', '"$1"', $channelsearchtext); + } + if ($this->db->exists('test-full-text-search', ["`pid` = ? AND MATCH (`searchtext`) AGAINST (? IN BOOLEAN MODE)", getmypid(), $channelsearchtext])) { + if (in_array($language, $this->pConfig->get($channel['uid'], 'channel', 'languages', [User::getLanguageCode($channel['uid'])]))) { + $store = true; + $this->logger->debug('Matching channel found.', ['uid' => $channel['uid'], 'label' => $channel['label'], 'language' => $language, 'channelsearchtext' => $channelsearchtext, 'searchtext' => $searchtext]); + break; + } + } + } + $this->db->close($channels); + + $this->db->delete('test-full-text-search', ['pid' => getmypid()]); + return $store; + } } diff --git a/src/Model/GServer.php b/src/Model/GServer.php index 4378814854..84d8dad8b6 100644 --- a/src/Model/GServer.php +++ b/src/Model/GServer.php @@ -240,7 +240,7 @@ class GServer } elseif (!empty($contact['baseurl'])) { $server = $contact['baseurl']; } elseif ($contact['network'] == Protocol::DIASPORA) { - $parts = parse_url($contact['url']); + $parts = (array)parse_url($contact['url']); unset($parts['path']); $server = (string)Uri::fromParts($parts); } else { @@ -589,7 +589,7 @@ class GServer if ((parse_url($url, PHP_URL_HOST) != parse_url($valid_url, PHP_URL_HOST)) && (parse_url($url, PHP_URL_PATH) != parse_url($valid_url, PHP_URL_PATH)) && (parse_url($url, PHP_URL_PATH) == '')) { Logger::debug('Found redirect. Mark old entry as failure and redirect to the basepath.', ['old' => $url, 'new' => $valid_url]); - $parts = parse_url($valid_url); + $parts = (array)parse_url($valid_url); unset($parts['path']); $valid_url = (string)Uri::fromParts($parts); diff --git a/src/Model/Post/Engagement.php b/src/Model/Post/Engagement.php index 74b479653c..4aaaf0e453 100644 --- a/src/Model/Post/Engagement.php +++ b/src/Model/Post/Engagement.php @@ -33,6 +33,7 @@ use Friendica\Model\Post; use Friendica\Model\Tag; use Friendica\Model\Verb; use Friendica\Protocol\Activity; +use Friendica\Protocol\ActivityPub\Receiver; use Friendica\Protocol\Relay; use Friendica\Util\DateTimeFormat; @@ -52,7 +53,7 @@ class Engagement } $parent = Post::selectFirst(['uri-id', 'created', 'author-id', 'owner-id', 'uid', 'private', 'contact-contact-type', 'language', 'network', - 'title', 'content-warning', 'body', 'author-contact-type', 'author-nick', 'author-addr', 'owner-contact-type', 'owner-nick', 'owner-addr'], + 'title', 'content-warning', 'body', 'author-contact-type', 'author-nick', 'author-addr', 'author-gsid', 'owner-contact-type', 'owner-nick', 'owner-addr'], ['uri-id' => $item['parent-uri-id']]); if ($parent['created'] < self::getCreationDateLimit(false)) { @@ -79,7 +80,14 @@ class Engagement $mediatype = self::getMediaType($item['parent-uri-id']); if (!$store) { - $mediatype = !empty($mediatype); + $store = !empty($mediatype); + } + + $searchtext = self::getSearchTextForItem($parent); + if (!$store) { + $content = trim(($parent['title'] ?? '') . ' ' . ($parent['content-warning'] ?? '') . ' ' . ($parent['body'] ?? '')); + $language = array_key_first(Item::getLanguageArray($content, 1, 0, $parent['author-id'])); + $store = DI::userDefinedChannel()->match($searchtext, $language); } $engagement = [ @@ -88,7 +96,7 @@ class Engagement 'contact-type' => $parent['contact-contact-type'], 'media-type' => $mediatype, 'language' => $parent['language'], - 'searchtext' => self::getSearchText($parent), + 'searchtext' => $searchtext, 'created' => $parent['created'], 'restricted' => !in_array($item['network'], Protocol::FEDERATED) || ($parent['private'] != Item::PUBLIC), 'comments' => DBA::count('post', ['parent-uri-id' => $item['parent-uri-id'], 'gravity' => Item::GRAVITY_COMMENT]), @@ -106,10 +114,56 @@ class Engagement Logger::debug('Engagement stored', ['fields' => $engagement, 'ret' => $ret]); } - private static function getSearchText(array $item): string + public static function getSearchTextForActivity(string $content, int $author_id, array $tags, array $receivers): string + { + $author = Contact::getById($author_id); + + $item = [ + 'uri-id' => 0, + 'network' => Protocol::ACTIVITYPUB, + 'title' => '', + 'content-warning' => '', + 'body' => $content, + 'private' => Item::PRIVATE, + 'author-id' => $author_id, + 'author-contact-type' => $author['contact-type'], + 'author-nick' => $author['nick'], + 'author-addr' => $author['addr'], + 'author-gsid' => $author['gsid'], + 'owner-id' => $author_id, + 'owner-contact-type' => $author['contact-type'], + 'owner-nick' => $author['nick'], + 'owner-addr' => $author['addr'], + ]; + + foreach ($receivers as $receiver) { + if ($receiver == Receiver::PUBLIC_COLLECTION) { + $item['private'] = Item::PUBLIC; + } + } + + return self::getSearchText($item, $receivers, $tags); + } + + private static function getSearchTextForItem(array $item): string + { + $receivers = array_column(Tag::getByURIId($item['uri-id'], [Tag::MENTION, Tag::IMPLICIT_MENTION, Tag::EXCLUSIVE_MENTION, Tag::AUDIENCE]), 'url'); + $tags = array_column(Tag::getByURIId($item['uri-id'], [Tag::HASHTAG]), 'name'); + return self::getSearchText($item, $receivers, $tags); + } + + private static function getSearchText(array $item, array $receivers, array $tags): string { $body = '[nosmile]network:' . $item['network']; + if (!empty($item['author-gsid'])) { + $gserver = DBA::selectFirst('gserver', ['platform'], ['id' => $item['author-gsid']]); + $platform = preg_replace( '/[\W]/', '', $gserver['platform'] ?? ''); + if (!empty($platform)) { + $body .= ' platform:' . $platform; + } + } + switch ($item['private']) { case Item::PUBLIC: $body .= ' visibility:public'; @@ -136,8 +190,8 @@ class Engagement } } - foreach (Tag::getByURIId($item['uri-id'], [Tag::MENTION, Tag::IMPLICIT_MENTION, Tag::EXCLUSIVE_MENTION, Tag::AUDIENCE]) as $tag) { - $contact = Contact::getByURL($tag['name'], false, ['nick', 'addr', 'contact-type']); + foreach ($receivers as $receiver) { + $contact = Contact::getByURL($receiver, false, ['nick', 'addr', 'contact-type']); if (empty($contact)) { continue; } @@ -149,8 +203,8 @@ class Engagement } } - foreach (Tag::getByURIId($item['uri-id'], [Tag::HASHTAG]) as $tag) { - $body .= ' tag:' . $tag['name']; + foreach ($tags as $tag) { + $body .= ' tag:' . $tag; } $body .= ' ' . $item['title'] . ' ' . $item['content-warning'] . ' ' . $item['body']; diff --git a/src/Model/Search.php b/src/Model/Search.php index 4b09af72d8..316e41c250 100644 --- a/src/Model/Search.php +++ b/src/Model/Search.php @@ -22,6 +22,8 @@ namespace Friendica\Model; use Friendica\Database\DBA; +use Friendica\DI; +use Friendica\Util\DateTimeFormat; /** * Model for DB specific logic for the search entity @@ -36,14 +38,43 @@ class Search */ public static function getUserTags(): array { - $termsStmt = DBA::p("SELECT DISTINCT(`term`) FROM `search`"); + $user_condition = ["`verified` AND NOT `blocked` AND NOT `account_removed` AND NOT `account_expired` AND `user`.`uid` > ?", 0]; - $tags = []; + $abandon_days = intval(DI::config()->get('system', 'account_abandon_days')); + if (!empty($abandon_days)) { + $user_condition = DBA::mergeConditions($user_condition, ["`last-activity` > ?", DateTimeFormat::utc('now - ' . $abandon_days . ' days')]); + } + $condition = $user_condition; + $condition[0] = "SELECT DISTINCT(`term`) FROM `search` INNER JOIN `user` ON `search`.`uid` = `user`.`uid` WHERE " . $user_condition[0]; + $sql = array_shift($condition); + $termsStmt = DBA::p($sql, $condition); + + $tags = []; while ($term = DBA::fetch($termsStmt)) { $tags[] = trim(mb_strtolower($term['term']), '#'); } DBA::close($termsStmt); + + $condition = $user_condition; + $condition[0] = "SELECT `include-tags` FROM `channel` INNER JOIN `user` ON `channel`.`uid` = `user`.`uid` WHERE " . $user_condition[0]; + $sql = array_shift($condition); + $channels = DBA::p($sql, $condition); + while ($channel = DBA::fetch($channels)) { + foreach (explode(',', $channel['include-tags']) as $tag) { + $tag = trim(mb_strtolower($tag)); + if (empty($tag)) { + continue; + } + if (!in_array($tag, $tags)) { + $tags[] = $tag; + } + } + } + DBA::close($channels); + + sort($tags); + return $tags; } } diff --git a/src/Model/User.php b/src/Model/User.php index 3368d42bd9..1d4c768aea 100644 --- a/src/Model/User.php +++ b/src/Model/User.php @@ -582,6 +582,12 @@ class User */ public static function getLanguages(): array { + $cachekey = 'user:getLanguages'; + $languages = DI::cache()->get($cachekey); + if (!is_null($languages)) { + return $languages; + } + $supported = array_keys(DI::l10n()->getLanguageCodes()); $languages = []; $uids = []; @@ -620,7 +626,10 @@ class User DBA::close($channels); ksort($languages); - return array_keys($languages); + $languages = array_keys($languages); + DI::cache()->set($cachekey, $languages); + + return $languages; } /** diff --git a/src/Module/Conversation/Timeline.php b/src/Module/Conversation/Timeline.php index 725634eb09..3e71caac7b 100644 --- a/src/Module/Conversation/Timeline.php +++ b/src/Module/Conversation/Timeline.php @@ -391,7 +391,7 @@ class Timeline extends BaseModule if (!empty($channel->fullTextSearch)) { $search = $channel->fullTextSearch; - foreach (['from', 'to', 'group', 'tag', 'network', 'visibility'] as $keyword) { + foreach (['from', 'to', 'group', 'tag', 'network', 'platform', 'visibility'] as $keyword) { $search = preg_replace('~(' . $keyword . ':.[\w@\.-]+)~', '"$1"', $search); } $condition = DBA::mergeConditions($condition, ["MATCH (`searchtext`) AGAINST (? IN BOOLEAN MODE)", $search]); diff --git a/src/Protocol/ActivityPub/Processor.php b/src/Protocol/ActivityPub/Processor.php index 49116444ad..d6140535b9 100644 --- a/src/Protocol/ActivityPub/Processor.php +++ b/src/Protocol/ActivityPub/Processor.php @@ -42,6 +42,7 @@ use Friendica\Model\Mail; use Friendica\Model\Tag; use Friendica\Model\User; use Friendica\Model\Post; +use Friendica\Model\Post\Engagement; use Friendica\Protocol\Activity; use Friendica\Protocol\ActivityPub; use Friendica\Protocol\Delivery; @@ -751,7 +752,7 @@ class Processor public static function addToFeaturedCollection(array $activity) { $post = self::getUriIdForFeaturedCollection($activity); - if (empty($post)) { + if (empty($post) || empty($post['author-id'])) { Queue::remove($activity); return; } @@ -1562,20 +1563,17 @@ class Processor return ''; } + $ldobject = JsonLD::compact($object); + $signer = []; - if (!empty($object['attributedTo'])) { - $attributed_to = $object['attributedTo']; - if (is_array($attributed_to)) { - $compacted = JsonLD::compact($object); - $attributed_to = JsonLD::fetchElement($compacted, 'as:attributedTo', '@id'); - } + $attributed_to = JsonLD::fetchElement($ldobject, 'as:attributedTo', '@id'); + if (!empty($attributed_to)) { $signer[] = $attributed_to; } - if (!empty($object['actor'])) { - $object_actor = $object['actor']; - } elseif (!empty($attributed_to)) { + $object_actor = JsonLD::fetchElement($ldobject, 'as:actor', '@id'); + if (!empty($attributed_to)) { $object_actor = $attributed_to; } else { // Shouldn't happen @@ -1591,8 +1589,6 @@ class Processor $actor = $object_actor; } - $ldobject = JsonLD::compact($object); - $type = JsonLD::fetchElement($ldobject, '@type'); $object_id = JsonLD::fetchElement($ldobject, 'as:object', '@id'); @@ -1607,10 +1603,11 @@ class Processor } $activity = $object; $ldactivity = $ldobject; - } else { + } elseif (!empty($object['id'])) { $activity = self::getActivityForObject($object, $actor); $ldactivity = JsonLD::compact($activity); - $object_id = $object['id']; + } else { + return null; } $ldactivity['recursion-depth'] = !empty($child['recursion-depth']) ? $child['recursion-depth'] + 1 : 0; @@ -1631,7 +1628,7 @@ class Processor if ($completion == Receiver::COMPLETION_RELAY) { $ldactivity['from-relay'] = $ldactivity['thread-completion']; - if (in_array($type, Receiver::CONTENT_TYPES) && !self::acceptIncomingMessage($ldactivity, $object_id)) { + if (in_array($type, Receiver::CONTENT_TYPES) && !self::acceptIncomingMessage($ldactivity)) { return null; } } @@ -1684,16 +1681,18 @@ class Processor * Test if incoming relay messages should be accepted * * @param array $activity activity array - * @param string $id object ID * @return boolean true if message is accepted */ - private static function acceptIncomingMessage(array $activity, string $id): bool + private static function acceptIncomingMessage(array $activity): bool { if (empty($activity['as:object'])) { + $id = JsonLD::fetchElement($activity, '@id'); Logger::info('No object field in activity - accepted', ['id' => $id]); return true; } + $id = JsonLD::fetchElement($activity, 'as:object', '@id'); + $replyto = JsonLD::fetchElement($activity['as:object'], 'as:inReplyTo', '@id'); $uriid = ItemURI::getIdByURI($replyto ?? ''); if (Post::exists(['uri-id' => $uriid])) { @@ -1731,7 +1730,23 @@ class Processor $languages = self::getPostLanguages($activity['as:object'] ?? ''); - return Relay::isSolicitedPost($messageTags, $content, $authorid, $id, Protocol::ACTIVITYPUB, $activity['thread-completion'] ?? 0, $languages); + $wanted = Relay::isSolicitedPost($messageTags, $content, $authorid, $id, Protocol::ACTIVITYPUB, $activity['from-relay'], $languages); + if ($wanted) { + return true; + } + + $receivers = []; + foreach (['as:to', 'as:cc', 'as:bto', 'as:bcc', 'as:audience'] as $element) { + $receiver_list = JsonLD::fetchElementArray($activity, $element, '@id'); + if (empty($receiver_list)) { + continue; + } + $receivers = array_merge($receivers, $receiver_list); + } + + $searchtext = Engagement::getSearchTextForActivity($content, $authorid, $messageTags, $receivers); + $language = array_key_first(Item::getLanguageArray($content, 1, 0, $authorid)); + return DI::userDefinedChannel()->match($searchtext, $language); } /** diff --git a/src/Protocol/Relay.php b/src/Protocol/Relay.php index 7d62ba576e..8761cd7f3d 100644 --- a/src/Protocol/Relay.php +++ b/src/Protocol/Relay.php @@ -193,12 +193,7 @@ class Relay } if (!empty($languages) || !empty($detected)) { - $cachekey = 'relay:isWantedLanguage'; - $user_languages = DI::cache()->get($cachekey); - if (is_null($user_languages)) { - $user_languages = User::getLanguages(); - DI::cache()->set($cachekey, $user_languages); - } + $user_languages = User::getLanguages(); foreach ($detected as $language) { if (in_array($language, $user_languages)) { diff --git a/src/Worker/OptimizeTables.php b/src/Worker/OptimizeTables.php index ad2ac444d1..db525fce2b 100644 --- a/src/Worker/OptimizeTables.php +++ b/src/Worker/OptimizeTables.php @@ -46,6 +46,7 @@ class OptimizeTables DBA::optimizeTable('parsed_url'); DBA::optimizeTable('session'); DBA::optimizeTable('post-engagement'); + DBA::optimizeTable('test-full-text-search'); if (DI::config()->get('system', 'optimize_all_tables')) { DBA::optimizeTable('apcontact'); diff --git a/static/dbstructure.config.php b/static/dbstructure.config.php index e0f98e97f8..1e72868860 100644 --- a/static/dbstructure.config.php +++ b/static/dbstructure.config.php @@ -56,7 +56,7 @@ use Friendica\Database\DBA; // This file is required several times during the test in DbaDefinition which justifies this condition if (!defined('DB_UPDATE_VERSION')) { - define('DB_UPDATE_VERSION', 1539); + define('DB_UPDATE_VERSION', 1540); } return [ @@ -1858,6 +1858,17 @@ return [ "uid_application-id" => ["uid", "application-id"], ] ], + "test-full-text-search" => [ + "comment" => "Test for a full text search match in user defined channels before storing the message in the system", + "fields" => [ + "pid" => ["type" => "int unsigned", "not null" => "1", "default" => "0", "comment" => "Process id of the worker"], + "searchtext" => ["type" => "mediumtext", "comment" => "Simplified text for the full text search"], + ], + "indexes" => [ + "PRIMARY" => ["pid"], + "searchtext" => ["FULLTEXT", "searchtext"], + ], + ], "userd" => [ "comment" => "Deleted usernames", "fields" => [ -- 2.39.5