Skip to content

Commit

Permalink
Add must-not search to indexer.go and add some test cases.
Browse files Browse the repository at this point in the history
  • Loading branch information
lijingpeng committed Nov 26, 2015
1 parent e1b46af commit f701dd9
Show file tree
Hide file tree
Showing 2 changed files with 195 additions and 13 deletions.
102 changes: 89 additions & 13 deletions core/indexer.go
Expand Up @@ -141,15 +141,15 @@ func (indexer *Indexer) Lookup(
}
numDocs = 0

// 合并关键词和标签为搜索键
keywords := make([]string, len(tokens)+len(labels))
copy(keywords, tokens)
copy(keywords[len(tokens):], labels)
mustKeywords, mustTokensLength, mustNotKeywords, isValid := getProssedQueries(tokens, labels)
if !isValid {
return
}

indexer.tableLock.RLock()
defer indexer.tableLock.RUnlock()
table := make([]*KeywordIndices, len(keywords))
for i, keyword := range keywords {
table := make([]*KeywordIndices, len(mustKeywords))
for i, keyword := range mustKeywords {
indices, found := indexer.tableLock.table[keyword]
if !found {
// 当反向索引表中无此搜索键时直接返回
Expand All @@ -160,6 +160,15 @@ func (indexer *Indexer) Lookup(
}
}

// 保存must not搜索键
mustNotTable := make([]*KeywordIndices, 0)
for _, keyword := range mustNotKeywords {
indices, found := indexer.tableLock.table[keyword]
if found {
mustNotTable = append(mustNotTable, indices)
}
}

// 当没有找到时直接返回
if len(table) == 0 {
return
Expand All @@ -171,6 +180,7 @@ func (indexer *Indexer) Lookup(
for iTable := 0; iTable < len(table); iTable++ {
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
}

// 平均文本关键词长度,用于计算BM25
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
for ; indexPointers[0] >= 0; indexPointers[0]-- {
Expand All @@ -186,7 +196,9 @@ func (indexer *Indexer) Lookup(
continue
}
}

iTable := 1

found := true
for ; iTable < len(table); iTable++ {
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
Expand All @@ -196,7 +208,11 @@ func (indexer *Indexer) Lookup(
position, foundBaseDocId := indexer.searchIndex(table[iTable],
0, indexPointers[iTable], baseDocId)
if foundBaseDocId {
indexPointers[iTable] = position
if !indexer.findInMustNotTable(mustNotTable, baseDocId) {
indexPointers[iTable] = position
} else {
found = false
}
} else {
if position == 0 {
// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
Expand All @@ -211,19 +227,27 @@ func (indexer *Indexer) Lookup(
}
}

// 如果搜索键只返回一个反向表, 并且存在逻辑非搜索键
// 则需要判断baseDocId是不是在逻辑非反向表中
if len(table) == 1 && len(mustNotTable) > 0 {
if indexer.findInMustNotTable(mustNotTable, baseDocId) {
found = false
}
}

if found {
indexedDoc := types.IndexedDocument{}

// 当为LocationsIndex时计算关键词紧邻距离
if indexer.initOptions.IndexType == types.LocationsIndex {
// 计算有多少关键词是带有距离信息的
numTokensWithLocations := 0
for i, t := range table[:len(tokens)] {
for i, t := range table[:mustTokensLength] {
if len(t.locations[indexPointers[i]]) > 0 {
numTokensWithLocations++
}
}
if numTokensWithLocations != len(tokens) {
if numTokensWithLocations != mustTokensLength {
if !countDocsOnly {
docs = append(docs, types.IndexedDocument{
DocId: baseDocId,
Expand All @@ -234,13 +258,13 @@ func (indexer *Indexer) Lookup(
}

// 计算搜索键在文档中的紧邻距离
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
tokenProximity, tokenLocations := computeTokenProximity(table[:mustTokensLength], indexPointers, mustKeywords[:mustTokensLength])
indexedDoc.TokenProximity = int32(tokenProximity)
indexedDoc.TokenSnippetLocations = tokenLocations

// 添加TokenLocations
indexedDoc.TokenLocations = make([][]int, len(tokens))
for i, t := range table[:len(tokens)] {
indexedDoc.TokenLocations = make([][]int, mustTokensLength)
for i, t := range table[:mustTokensLength] {
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
}
}
Expand All @@ -250,7 +274,7 @@ func (indexer *Indexer) Lookup(
indexer.initOptions.IndexType == types.FrequenciesIndex {
bm25 := float32(0)
d := indexer.docTokenLengths[baseDocId]
for i, t := range table[:len(tokens)] {
for i, t := range table[:mustTokensLength] {
var frequency float32
if indexer.initOptions.IndexType == types.LocationsIndex {
frequency = float32(len(t.locations[indexPointers[i]]))
Expand Down Expand Up @@ -423,3 +447,55 @@ func (indexer *Indexer) RemoveDoc(docId uint64) {
indexer.numDocuments--
indexer.tableLock.Unlock()
}

// getProssedQueries partitions the raw search tokens and labels into
// "must" and "must-not" keyword lists.
//
// A key prefixed with '+' or carrying no prefix must appear in a matching
// document; a key prefixed with '-' must not appear. The prefix, when
// present, is stripped from the returned keyword. Empty strings are skipped.
//
// Returns:
//   - mustKeywords: keys that must appear (keys derived from tokens first,
//     followed by keys derived from labels)
//   - mustTokensLength: how many entries of mustKeywords came from tokens
//     (label-derived keys are not counted)
//   - mustNotKeywords: keys that must not appear
//   - isValid: false when there are must-not keys but no must-tokens —
//     a query consisting only of exclusions cannot be evaluated
func getProssedQueries(tokens []string, labels []string) (
	[]string, int, []string, bool) {
	mustTokensLength := 0
	mustKeywords := make([]string, 0, len(tokens)+len(labels))
	mustNotKeywords := make([]string, 0)

	for _, v := range tokens {
		if len(v) == 0 {
			continue // empty tokens carry no information
		}
		switch v[0] {
		case '+':
			mustKeywords = append(mustKeywords, v[1:])
			mustTokensLength++
		case '-':
			mustNotKeywords = append(mustNotKeywords, v[1:])
		default:
			mustKeywords = append(mustKeywords, v)
			mustTokensLength++
		}
	}

	for _, v := range labels {
		if len(v) == 0 {
			continue
		}
		switch v[0] {
		case '+':
			mustKeywords = append(mustKeywords, v[1:])
		case '-':
			mustNotKeywords = append(mustNotKeywords, v[1:])
		default:
			mustKeywords = append(mustKeywords, v)
		}
	}

	// Reject queries that consist solely of must-not keys: with no
	// must-token to anchor the lookup there is nothing to intersect against.
	if mustTokensLength == 0 && len(mustNotKeywords) > 0 {
		return mustKeywords, mustTokensLength, mustNotKeywords, false
	}
	return mustKeywords, mustTokensLength, mustNotKeywords, true
}

// findInMustNotTable reports whether docId appears in any of the given
// must-not keyword indices.
// Returns true when found, false otherwise.
func (indexer *Indexer) findInMustNotTable(table []*KeywordIndices, docId uint64) bool {
	for _, indices := range table {
		// Binary-search each must-not index over its full range.
		_, found := indexer.searchIndex(indices,
			0, indexer.getIndexLength(indices)-1, docId)
		if found {
			return true
		}
	}
	return false
}
106 changes: 106 additions & 0 deletions core/indexer_test.go
Expand Up @@ -370,3 +370,109 @@ func TestLookupWithLocations(t *testing.T) {
docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
}

// TestLookupWithMustNot checks that a must-not key excludes documents
// containing it, for single- and multi-key queries against one document.
func TestLookupWithMustNot(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
	// doc0 = "token2 token4 token4 token2 token3 token4"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 0,
		Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0, 21}},
			{"token3", 0, []int{28}},
			{"token4", 0, []int{7, 14, 35}},
		},
	})

	// doc0 contains token3, so "-token3" must exclude it.
	docs, num := indexer.Lookup([]string{"+token2", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
	// Fail loudly when documents leak through the exclusion (previously this
	// branch only logged on success and could never fail the test).
	if len(docs) != 0 {
		t.Errorf("expected 0 docs, got %d", len(docs))
	}

	_, num = indexer.Lookup([]string{"+token2", "token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)

	_, num = indexer.Lookup([]string{"+token2", "-token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
}

// TestLookupWithMustNotMulti exercises must/must-not queries over a small
// corpus of six documents indexed by doc IDs only.
func TestLookupWithMustNotMulti(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})

	// Corpus:
	//   doc0 = "token2 token3"
	//   doc1 = "token1 token2 token3"
	//   doc2 = "token1 token2"
	//   doc3 = "token2"
	//   doc7 = "token1 token3"
	//   doc9 = "token3"
	corpus := []types.DocumentIndex{
		{DocId: 0, Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0}},
			{"token3", 0, []int{7}},
		}},
		{DocId: 1, Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token2", 0, []int{7}},
			{"token3", 0, []int{14}},
		}},
		{DocId: 2, Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token2", 0, []int{7}},
		}},
		{DocId: 3, Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0}},
		}},
		{DocId: 7, Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token3", 0, []int{7}},
		}},
		{DocId: 9, Keywords: []types.KeywordIndex{
			{"token3", 0, []int{0}},
		}},
	}
	for i := range corpus {
		indexer.AddDocument(&corpus[i])
	}

	// Sanity-check the inverted index before querying.
	utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))

	// Each case pairs a query with the expected stringified result set.
	queries := []struct {
		tokens   []string
		expected string
	}{
		{[]string{"+token4"}, ""},
		{[]string{"token1"}, "[7 0 []] [2 0 []] [1 0 []] "},
		{[]string{"token1", "+token4"}, ""},
		{[]string{"+token1", "token2"}, "[2 0 []] [1 0 []] "},
		{[]string{"+token2", "+token1"}, "[2 0 []] [1 0 []] "},
		{[]string{"token1", "-token2"}, "[7 0 []] "},
		{[]string{"token2", "-token3"}, "[3 0 []] [2 0 []] "},
		{[]string{"token2", "-token3", "-token1"}, "[3 0 []] "},
		{[]string{"-token2", "-token3", "-token1"}, ""},
	}
	for _, q := range queries {
		utils.Expect(t, q.expected,
			indexedDocsToString(indexer.Lookup(q.tokens, []string{}, nil, false)))
	}
}

0 comments on commit f701dd9

Please sign in to comment.