Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Content Search: use AND operator by default and don't apply mask for phrases #6979

Merged
merged 24 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0b2ce57
Content Search: use AND operator by default and don't apply mask for …
yurabakhtin Apr 30, 2024
fcbe691
Fix test
yurabakhtin Apr 30, 2024
4f92d84
Fix SearchQueryTest
yurabakhtin Apr 30, 2024
a74a574
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin Apr 30, 2024
c233527
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin May 2, 2024
06c6fe0
Fix creating of FULLTEXT index for mysql 8+
yurabakhtin May 2, 2024
c75eee8
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin May 2, 2024
92e4881
Fix mysql search driver for short words
yurabakhtin May 3, 2024
4fc9d98
Improve keyword highlighting
yurabakhtin May 3, 2024
30379f1
Use fixed value instead of `mysql.ft_min_word_len`
yurabakhtin May 3, 2024
ce85d0c
Exclude highlighting on console request
yurabakhtin May 6, 2024
308e400
Escape special char `@` on mysql search driver
yurabakhtin May 6, 2024
ff2705f
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin May 7, 2024
7aed7d6
Escape special chars on mysql search driver
yurabakhtin May 7, 2024
3673239
Escape char `/` on mysql search driver
yurabakhtin May 7, 2024
b37078d
Escape quote chars on mysql search driver
yurabakhtin May 7, 2024
bda6c61
Improve word highlighting with quote or apostrophe
yurabakhtin May 7, 2024
a6bcf94
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin May 9, 2024
9cec0c6
Add code comment for search query and driver
yurabakhtin May 9, 2024
9faa03a
Merge branch 'develop' into enh/230-content-search-default-operator
yurabakhtin May 16, 2024
b850754
Refactor search query
yurabakhtin May 17, 2024
5bdb059
Fix test for ZendLucence Driver
yurabakhtin May 17, 2024
2d165ae
Fix ZendLucence Driver for phrases
yurabakhtin May 17, 2024
f21605e
Merge branch 'develop' into enh/230-content-search-default-operator
luke- May 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG-DEV.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ HumHub Changelog
- Fix #6552: When a JS file has `module.initOnAjaxLoad = true;`, if the `initOnAjaxUrls` contains multiple params in the URL, the `init` function is not triggered
- Fix #7006: Disable `mustChangePassword` check for impersonated access tokens
- Fix #7004: Fix people filter by group
- Enh #6979: Content Search: use AND operator by default and don't apply mask for phrases
- Fix #7011: Fixed performance issue in `Members::getPrivilegedUserIds`
- Enh #7010: Rich text tables: Vertical align top instead of middle
- Enh #5310: Mobile - Zooming into pictures
Expand Down
139 changes: 88 additions & 51 deletions protected/humhub/libs/SearchQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,81 +13,118 @@
* Parses a given search query into relevant terms.
*
* Examples:
* - `Apple Banana`, `Apple OR Banana`
* - `Apple AND Banana`, `+Apple +Banana`
* - `Apple AND Banana OR Grape`, `+Apple +Banana Grape`
* - `Apple Banana`, `+Apple +Banana`
* - `"Apple pie" "Banana bread"`
* - `Apple NOT Banana`, `Apple -Banana`
*
*
* @since 1.16
*/
class SearchQuery
{
/**
* @var string[] All terms without operator
* @readonly
*/
public array $orTerms;

/**
* @var string[] All terms which are required, with AND or + operator
* All terms which are required, without operator or with AND or + operator
*
* This is DEFAULT term (for all keyword without operator)
*
* Keyword samples how they are stored here:
* 'Apple' => ['Apple*']
* 'AND Apple' => ['AND*', 'Apple*']
* '+Apple' => ['Apple*']
* '"Apple"' => ['Apple']
* '+"Apple"' => ['Apple']
* '+"Apple Banana"' => ['Apple Banana']
* 'Apple pie AND Banana' => ['Apple*', 'pie*', 'AND*', 'Banana*']
* '"Apple pie" Banana' => ['Apple pie', 'Banana*']
* @var string[] $terms
* @readonly
*/
public array $andTerms;
public array $terms = [];

/**
* @var string[] All terms which should excluded, with NOT or - operator
* All terms which should excluded, with NOT or - operator
*
* Keyword samples how they are stored here:
* 'NOT Apple' => ['Apple*']
* '-Apple' => ['Apple*']
* 'NOT "Apple"' => ['Apple']
* '-"Apple"' => ['Apple']
* '-"Apple Banana"' => ['Apple Banana']
* '-Apple -pie NOT Banana' => ['Apple*', 'pie*', 'Banana*']
* '-"Apple pie" -Banana' => ['Apple pie', 'Banana*']
* @var string[] $notTerms
* @readonly
*/
public array $notTerms;
public array $notTerms = [];

/**
* @param $query string The search query to parse
*/
public function __construct(string $query)
{
$notTerms = [];
$orTerms = [];
$andTerms = [];

preg_match_all(
'/(?|((?i)AND )?((?i)OR )?((?i)NOT )?[\+\-]?"([^"]+)"|(((?i)AND )?((?i)OR )?((?i)NOT )?\S+))/',
$query,
$result,
PREG_PATTERN_ORDER,
);

if (!empty($result[0]) && is_array($result[0])) {
foreach ($result[0] as $i => $term) {

// Remove quotation marks
$term = str_replace('"', '', $term);

if (str_starts_with($term, '+') || str_starts_with($term, 'AND ')) {

/**
* Special Case: In search queries like "Apple AND Banana", Apple should
* become a `AND` term too.
*/
if ($i === 1 && str_starts_with($term, 'AND ') && count($orTerms) === 1) {
$andTerms = $orTerms;
$orTerms = [];
}

$andTerms[] = preg_replace('/^\+?((?i)AND )?/', '', $term);
} elseif (str_starts_with($term, '-') || str_starts_with($term, 'NOT ')) {
$notTerms[] = preg_replace('/^\-?((?i)NOT )?/', '', $term);
} else {
$orTerms[] = preg_replace('/^((?i)OR )?/', '', $term);
}
foreach ($this->parseQuery($query) as $term) {
if ($this->isNotTerm($term)) {
$this->addNotTerm($term);
} else {
$this->addTerm($term);
}
}
}

protected function parseQuery(string $query): array
{
preg_match_all('/(?|((?i)NOT )?[\+\-]*"([^"]+)"|(((?i)NOT )?\S+))/', $query, $result);

$this->notTerms = array_filter($notTerms);
$this->orTerms = array_filter($orTerms);
$this->andTerms = array_filter($andTerms);
return !empty($result[0]) && is_array($result[0]) ? $result[0] : [];
}

protected function isNotTerm(string $term): bool
{
return str_starts_with($term, '-') || str_starts_with($term, 'NOT ');
}

protected function addNotTerm(string $term)
{
$term = preg_replace('/^(-*|NOT )/', '', $term);

if ($term = $this->prepareTerm($term)) {
$this->notTerms[] = $term;
}
}

protected function addTerm(string $term)
{
$term = ltrim($term, '+');

if ($term = $this->prepareTerm($term)) {
$this->terms[] = $term;
}
}

/**
* Prepare a term before using
*
* @param string $term
* @return string|null NULL - if the term must not be used at all
*/
protected function prepareTerm(string $term): ?string
{
if (!$this->isPhrase($term)) {
// A not quoted term should be searched as wildcard by default
// The last char '*' is used as flag for wildcard
$term = rtrim($term, '*"') . '*';
}

// Remove all rest quotes around each term
$term = trim($term, '"');

// Remove the chars if they stay after first quote like `"---Apple"`, `"++++Banana"`
$term = ltrim($term, '+-');

return $term === '' || $term === '*' ? null : $term;
}

protected function isPhrase(string $term): bool
{
return str_starts_with($term, '"') && str_ends_with($term, '"');
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,8 @@ humhub.module('comment', function (module, require, $) {

// Highlight currently searching keywords in the loaded comments
const contentSearchKeyword = $('.container-contents .form-search input[name=keyword]');
if (contentSearchKeyword.length && contentSearchKeyword.val().length) {
contentSearchKeyword.val().split(' ').forEach((keyword) => $html.highlight(keyword))
if (contentSearchKeyword.length) {
additions.highlightWords($html, contentSearchKeyword.val());
}
}).catch(function (err) {
module.log.error(err, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public function safeUp()
$this->safeAddForeignKey('fk_content_fulltext', 'content_fulltext', 'content_id', 'content', 'id', 'CASCADE', 'CASCADE');

try {
$this->execute("ALTER TABLE content_fulltext ADD FULLTEXT INDEX ftx (contents ASC, comments ASC, files ASC)");
$this->execute('ALTER TABLE content_fulltext ADD FULLTEXT INDEX ftx (contents, comments, files)');

Yii::$app->queue->push(new SearchRebuildIndex());

Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
humhub.module('content.highlight', function (module, require, $) {
const Widget = require('ui.widget').Widget;
const event = require('event');
const highlightWords = require('ui.additions').highlightWords;

const layout = $('.layout-content-container');

const init = function () {
event.on('humhub:modules:content:highlight:afterInit', () => highlight());
layout.find('[data-ui-widget="ui.richtext.prosemirror.RichText"]')
.on('afterRender', (obj) => highlight($(obj.target)));
.on('afterRender', (obj) => highlight(obj.target));

const wallStream = Widget.instance('[data-ui-widget="stream.wall.WallStream"]');
if (wallStream) {
Expand All @@ -20,7 +21,7 @@ humhub.module('content.highlight', function (module, require, $) {
if (typeof object === 'undefined') {
object = layout;
}
module.config.keyword.split(' ').forEach((keyword) => object.highlight(keyword));
highlightWords(object, module.config.keyword);
}
}

Expand Down
26 changes: 19 additions & 7 deletions protected/humhub/modules/content/search/driver/MysqlDriver.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@

class MysqlDriver extends AbstractDriver
{
/**
* Minimum word length for "And Terms",
* Words with less length are handled as "Or Terms"
* NOTE: Using of the config value mysql.ft_min_word_len doesn't work properly.
*
* @var int $minAndTermLength
*/
public int $minAndTermLength = 3;

public function purge(): void
{
ContentFulltext::deleteAll();
Expand Down Expand Up @@ -128,14 +137,11 @@ private function createMysqlFullTextQuery(SearchQuery $query, array $matchFields
{
$againstQuery = '';

foreach ($query->andTerms as $keyword) {
$againstQuery .= '+' . rtrim($keyword, '*') . '* ';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurabakhtin Is there a reason why you removed the wildcards?

Basically, the search for “Apple” should also find “Applepie”.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@luke- I have moved the wildcards from drivers(mysql, zend, solr) to SearchQuery code here, because the chars " are removed on the SearchQuery side, so we cannot know what original term was really when we are on the driver side.

This is how it worked before the change:

  • Apple => Apple* (find Applepie)
  • "Apple" => Apple* (find Applepie)

After the change:

  • Apple => Apple* (find Applepie)
  • "Apple" => Apple (find only Apple, no Applepie)

As I understand it should work as it is described in the first table here.

Tests for quoted cases are here:

I have done the change for the request "Phrases should actually work. Can you check what is causing the problem here and add tests if necessary?", please correct me if I misunderstood it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurabakhtin My intention was for “SearchQuery” to be as neutral a representation of the search query as possible. Without any syntax like wildcards or terms in quotation marks.

e.g. Asterisk or Quotation marks is more of a driver-specific syntax. (Even if the similar for MySQL and Lucene)

But I understand that, for example, with SearchRequest::andTerms we cannot distinguish whether it was originally quoted or not.

Do you have any ideas?

  • We could introduce new attributes such as andParts (with Wildcard) and andTerms (with Quotationmarks).

  • We could define that as soon as a space is contained in e.g. andTerms, it is seen as a term, otherwise it is provided with a wildcard. (But then “Apple” always becomes Apple*.

  • ?

Or did I get something wrong here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@luke-

My intention was for “SearchQuery” to be as neutral a representation of the search query as possible. Without any syntax like wildcards or terms in quotation marks.

e.g. Asterisk or Quotation marks is more of a driver-specific syntax. (Even if the similar for MySQL and Lucene)

Ok, I have understood your intention.

For solve it we could implement new array as andParts or extend the existing array with new array item to mark each term like this:

SearchQuery->andTerms = [
    ['Apple', 'mask' => false],
    ['Pine', 'mask' => true],
];

so on driver side we will know what term should be masked with *.

But I am thinking why should we keep an additional array andParts or use the complex structure if we can use a last char of each term as flag. I.e. if the last char is * then we can decide such term should be processed with specific way on a driver side.
For current drivers(mysql, zend, solr) we can keep the code as is, but for future drivers the last char * can be used as flag like this, if the driver doesn't support terms ended with *:

foreach ($request->getSearchQuery()->andTerms as $term) {
    $keywordQuery->addSubquery(str_ends_with($term, '*') ? $this->prepareMaskedTerm($term) : $term);
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurabakhtin Okay, I got your point.

Do I understand it correctly that now e.g. in SearchQuery::andTerms, can contain entries that are either quoted and contain an asterisk. Or is only the asterisk included?

Your approach is probably the right one. Since in fact all current drivers support this and it also corresponds to the query language of SearchQuery.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do I understand it correctly that now e.g. in SearchQuery::andTerms, can contain entries that are either quoted and contain an asterisk. Or is only the asterisk included?

@luke- Yes, but a bit correction about "quoted":

  • entries that are either simple word without quotes or contain an asterisk

Examples how a requested keyword is stored in terms:

  • Apple => Apple*
  • "Apple" => Apple
  • Foo bar => Foo bar*
  • "Foo bar" => Foo bar

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurabakhtin Can you please add this example somewhere in the SearchQuery PHPDoc?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurabakhtin Can you please add this example somewhere in the SearchQuery PHPDoc?

@luke- 9cec0c6#diff-70f82fb3d09d4959fb9f44a526f8455144c112df88ec7299a2efe28a76171169R28-R64

}
foreach ($query->orTerms as $keyword) {
$againstQuery .= rtrim($keyword, '*') . '* ';
foreach ($query->terms as $term) {
$againstQuery .= '+' . $this->prepareTerm($term) . ' ';
}
foreach ($query->notTerms as $keyword) {
$againstQuery .= '-' . $keyword . ' ';
foreach ($query->notTerms as $term) {
$againstQuery .= '-' . $this->prepareTerm($term) . ' ';
}

return sprintf(
Expand All @@ -145,6 +151,12 @@ private function createMysqlFullTextQuery(SearchQuery $query, array $matchFields
);
}

protected function prepareTerm(string $term): string
{
// Wrap a keyword in quotes to avoid error with the special chars in the sql MATCH-AGAINST expression
return preg_match('#[^\p{L}\d\*’\'`\-\_]#', $term) ? '"' . $term . '"' : $term;
}

protected function addQueryFilterVisibility(ActiveQuery $query): ActiveQuery
{
$query->andWhere(['content.state' => Content::STATE_PUBLISHED]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
use ZendSearch\Lucene\Index;
use ZendSearch\Lucene\Index\Term;
use ZendSearch\Lucene\Lucene;
use ZendSearch\Lucene\Search\Query\AbstractQuery;
use ZendSearch\Lucene\Search\Query\Boolean;
use ZendSearch\Lucene\Search\Query\MultiTerm;
use ZendSearch\Lucene\Search\Query\Phrase;
use ZendSearch\Lucene\Search\Query\Range;
use ZendSearch\Lucene\Search\Query\Term as TermQuery;
use ZendSearch\Lucene\Search\Query\Wildcard;
Expand Down Expand Up @@ -155,23 +157,35 @@ public function search(SearchRequest $request): ResultSet
return $resultSet;
}

protected function prepareTerm(string $term): AbstractQuery
{
$term = mb_strtolower($term);

if (str_contains($term, ' ')) {
return new Phrase(explode(' ', $term));
}

if (str_ends_with($term, '*')) {
return new Wildcard(new Term($term));
}

return new TermQuery(new Term($term));
}

protected function buildSearchQuery(SearchRequest $request): Boolean
{
$query = new Boolean();

Wildcard::setMinPrefixLength(0);

$keywordQuery = new Boolean();
foreach ($request->getSearchQuery()->orTerms as $term) {
$keywordQuery->addSubquery(new Wildcard(new Term(mb_strtolower($term) . '*')), null);
}

foreach ($request->getSearchQuery()->andTerms as $term) {
$keywordQuery->addSubquery(new Wildcard(new Term(mb_strtolower($term) . '*')), true);
foreach ($request->getSearchQuery()->terms as $term) {
$keywordQuery->addSubquery($this->prepareTerm($term), true);
}

foreach ($request->getSearchQuery()->notTerms as $term) {
$keywordQuery->addSubquery(new TermQuery(new Term(mb_strtolower($term))), false);
$keywordQuery->addSubquery($this->prepareTerm($term), false);
}

if (count($keywordQuery->getSubqueries())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use humhub\modules\content\models\Content;
use humhub\modules\content\Module;
use humhub\modules\content\search\driver\AbstractDriver;
use humhub\modules\content\search\driver\MysqlDriver;
use humhub\modules\content\search\ResultSet;
use humhub\modules\content\search\SearchRequest;
use humhub\modules\content\services\ContentSearchService;
Expand Down Expand Up @@ -45,16 +46,23 @@ public function testKeywords()
(new Post($space, Content::VISIBILITY_PUBLIC, ['message' => 'Something Other']))->save();
(new Post($space, Content::VISIBILITY_PUBLIC, ['message' => 'Marabru Leav Test X']))->save();

// Test Multiple AND Keywords
#$this->assertCount(1, $this->getSearchResultByKeyword('Marabru')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('"Marabru" Tes')->results);
$this->assertCount(0, $this->getSearchResultByKeyword('"Marabr" Test')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('"Marabru Leav" "Leav Test"')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('"Something Other"')->results);
$this->assertCount(0, $this->getSearchResultByKeyword('Some Test')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('Some -Test')->results);

$this->assertCount(1, $this->getSearchResultByKeyword('Marabru Leav Abcd')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('Marabru Leav')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('Marabru Leav NOT Abcd')->results);
$this->assertCount(0, $this->getSearchResultByKeyword('Marabru -Leav')->results);
$this->assertCount(0, $this->getSearchResultByKeyword('+Marabru +Leav* +Abcd')->results);
$this->assertCount(0, $this->getSearchResultByKeyword('Marabru Leav +Abcd')->results);

$this->assertCount(1, $this->getSearchResultByKeyword('Something -Marabru')->results);
$this->assertCount(1, $this->getSearchResultByKeyword('Something -Marab')->results);

// Wildcards
// Wildcards (it is applied automatically even if the char `*` is not typed)
$this->assertCount(1, $this->getSearchResultByKeyword('Marabr*')->results);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ humhub.module('stream.SimpleStream', function (module, require, $) {
var content = require('content');
var Url = require('ui.filter').Url;
var Widget = require('ui.widget').Widget;
var highlightWords = require('ui.additions').highlightWords;

/**
* Simple stream component can be used for static streams without load logic (only reload single content).
Expand Down Expand Up @@ -58,7 +59,7 @@ humhub.module('stream.SimpleStream', function (module, require, $) {
widgets.on('afterInit', function() {
if (!$(this).data('isHighlighted')) {
$(this).data('isHighlighted', true);
that.highlightInput.val().split(' ').forEach((keyword) => $(this).highlight(keyword));
highlightWords(this, that.highlightInput.val());
}
});
}
Expand Down