Creates a TokenMetadataStore to return startPosition of tokens in results #79

Open · wants to merge 4 commits into master
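
For orientation, a minimal sketch of the behaviour this change is after, pieced together from the tests included below; the document id, field name, and offsets are illustrative and assume the default pipeline (trimmer, stop word filter, stemmer):

var idx = new lunr.Index
idx.field('body')
idx.add({ id: 1, body: 'a quick test' })

var results = idx.search('test')
// results[0].ref    => 1
// results[0].score  => vector similarity, as before
// results[0].tokens => [{ raw: 'test', startPos: 8, indexedAs: 'test', field: 'body' }]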
3 changes: 3 additions & 0 deletions Makefile
@@ -12,6 +12,9 @@ SRC = lib/lunr.js \
lib/stop_word_filter.js \
lib/trimmer.js \
lib/token_store.js \
lib/token_metadata_store.js \
lib/token.js \
lib/token_list.js \

YEAR = $(shell date +%Y)
VERSION = $(shell cat VERSION)
20 changes: 17 additions & 3 deletions lib/index.js
@@ -18,9 +18,12 @@ lunr.Index = function () {
this.tokenStore = new lunr.TokenStore
this.corpusTokens = new lunr.SortedSet
this.eventEmitter = new lunr.EventEmitter
this.tokenMetadataStore = new lunr.TokenMetadataStore

this._idfCache = {}

this.useTokenMetadata = true

this.on('add', 'remove', 'update', (function () {
this._idfCache = {}
}).bind(this))
@@ -145,7 +148,16 @@ lunr.Index.prototype.add = function (doc, emitEvent) {
emitEvent = emitEvent === undefined ? true : emitEvent

this._fields.forEach(function (field) {
var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))
var tokenList = this.pipeline.run(lunr.tokenizer(doc[field.name])),
fieldTokens = []

tokenList.toArray().forEach(function(token) {
token.field = field.name
if(this.useTokenMetadata){
this.tokenMetadataStore.add(docRef, token)
}
fieldTokens.push(token.indexedAs)
}, this)

docTokens[field.name] = fieldTokens
lunr.SortedSet.prototype.add.apply(allDocumentTokens, fieldTokens)
@@ -199,6 +211,7 @@ lunr.Index.prototype.remove = function (doc, emitEvent) {
var docTokens = this.documentStore.get(docRef)

this.documentStore.remove(docRef)
this.tokenMetadataStore.remove(docRef)

docTokens.forEach(function (token) {
this.tokenStore.remove(token, docRef)
@@ -283,11 +296,12 @@ lunr.Index.prototype.idf = function (term) {
* @memberOf Index
*/
lunr.Index.prototype.search = function (query) {
var queryTokens = this.pipeline.run(lunr.tokenizer(query)),
var queryTokenList = this.pipeline.run(lunr.tokenizer(query)),
queryVector = new lunr.Vector,
documentSets = [],
fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

var queryTokens = queryTokenList.indexTokens()
var hasSomeToken = queryTokens.some(function (token) {
return this.tokenStore.has(token)
}, this)
@@ -333,7 +347,7 @@ lunr.Index.prototype.search = function (query) {

return documentSet
.map(function (ref) {
return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
return { ref: ref, score: queryVector.similarity(this.documentVector(ref)), tokens: this.tokenMetadataStore.getAll(ref, queryTokens) }
}, this)
.sort(function (a, b) {
return b.score - a.score
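
A hedged sketch of the document lifecycle these index.js changes set up; "doc1" is illustrative and the stemmed forms assume the default pipeline:

var idx = new lunr.Index
idx.field('body')

// add(): each token that survives the pipeline is tagged with its field
// and, while useTokenMetadata is true (the default), recorded per docRef
idx.add({ id: 'doc1', body: 'testing tokens' })
idx.tokenMetadataStore.store['doc1'] // => { test: [Token], token: [Token] }

// remove(): the metadata is dropped together with the document's index entries
idx.remove({ id: 'doc1' })
idx.tokenMetadataStore.store['doc1'] // => undefined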
33 changes: 25 additions & 8 deletions lib/pipeline.js
@@ -172,21 +172,38 @@ lunr.Pipeline.prototype.remove = function (fn) {
* @returns {Array}
* @memberOf Pipeline
*/
lunr.Pipeline.prototype.run = function (tokens) {
var out = [],
tokenLength = tokens.length,
lunr.Pipeline.prototype.run = function (_tokens) {
var out = new lunr.TokenList,
tokens = new lunr.TokenList,
stackLength = this._stack.length

// If tokenizer didn't give us Tokens, convert them
if(_tokens instanceof lunr.TokenList) {
tokens = _tokens
}
else {
tokens.setList(_tokens.map(function(token) {
return new lunr.Token({ raw: token })
}))
}

var rawTokens = tokens.rawTokens(),
tokenLength = tokens.length

for (var i = 0; i < tokenLength; i++) {
var token = tokens[i]
var token = tokens.get(i)
var indexToken = token.raw

for (var j = 0; j < stackLength; j++) {
token = this._stack[j](token, i, tokens)
if (token === void 0) break
indexToken = this._stack[j](indexToken, i, rawTokens)
if (indexToken === void 0) break
};

if (token !== void 0) out.push(token)
};
if (indexToken !== void 0) {
token.indexedAs = indexToken
out.push(token)
}
}

return out
}
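
To illustrate the new run() contract (the uppercasing function mirrors the updated pipeline test further down):

var pipeline = new lunr.Pipeline
pipeline.add(function (t) { return t.toUpperCase() })

// plain string input is wrapped into lunr.Token objects, so existing
// pipeline functions still receive and return raw strings
var out = pipeline.run(['a', 'b'])
out instanceof lunr.TokenList // => true
out.indexTokens()             // => ['A', 'B'] (post-pipeline values)
out.rawTokens()               // => ['a', 'b'] (original values preserved)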
22 changes: 22 additions & 0 deletions lib/token.js
@@ -0,0 +1,22 @@
lunr.Token = function(args) {
// The indexed value of the token
this.indexedAs = args.indexedAs

// Start position in the document
this.startPos = args.startPos

// Name of the field in which this token appears
this.field = args.field

// The raw value of the token in the document
this.raw = args.raw
}

lunr.Token.prototype.toJSON = function() {
return {
indexedAs: this.indexedAs,
startPos: this.startPos,
field: this.field,
raw: this.raw
}
}
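
A small usage sketch; all field values here are illustrative:

var token = new lunr.Token({
  raw: 'testing',    // the text as it appears in the document
  indexedAs: 'test', // the value after the pipeline has run (e.g. stemming)
  startPos: 0,       // character offset within the field
  field: 'body'
})

JSON.stringify(token) // uses toJSON(): {"indexedAs":"test","startPos":0,"field":"body","raw":"testing"}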
41 changes: 41 additions & 0 deletions lib/token_list.js
@@ -0,0 +1,41 @@
lunr.TokenList = function(elms) {
this.elements = []
this.length = 0

if(elms) {
this.setList(elms)
}
}

lunr.TokenList.prototype.push = function(token) {
if(!(token instanceof lunr.Token)){
throw new Error ('Cannot add type ' + typeof(token) + " to a token list, must be lunr.Token")
}
this.elements.push(token)
this.length++
}

lunr.TokenList.prototype.get = function(index) {
return this.elements[index]
}

lunr.TokenList.prototype.setList = function(elements) {
this.elements = elements
this.length = this.elements.length
}

lunr.TokenList.prototype.toArray = function() {
return Array.prototype.slice.call(this.elements, 0)
}

lunr.TokenList.prototype.indexTokens = function() {
return this.elements.map(function(token) {
return token.indexedAs
})
}

lunr.TokenList.prototype.rawTokens = function() {
return this.elements.map(function(token) {
return token.raw
})
}
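
A sketch of the list in use; the tokens are illustrative:

var list = new lunr.TokenList([
  new lunr.Token({ raw: 'foxes', indexedAs: 'fox' }),
  new lunr.Token({ raw: 'jumped', indexedAs: 'jump' })
])

list.length        // => 2
list.get(1).raw    // => 'jumped'
list.indexTokens() // => ['fox', 'jump']
list.rawTokens()   // => ['foxes', 'jumped']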
51 changes: 51 additions & 0 deletions lib/token_metadata_store.js
@@ -0,0 +1,51 @@
lunr.TokenMetadataStore = function() {
this.store = {}
}

lunr.TokenMetadataStore.prototype.add = function(docRef, token) {
if(!(token instanceof lunr.Token)){
throw new Error ("Must add lunr.Token to TokenMetadataStore")
}

var idxVal = token.indexedAs

if(!idxVal) return

this.store[docRef] = this.store[docRef] || {}
this.store[docRef][idxVal] = this.store[docRef][idxVal] || []

this.store[docRef][idxVal].push(token);
}

lunr.TokenMetadataStore.prototype.get = function(docRef, idxVal) {
if(this.store[docRef] && this.store[docRef][idxVal]) {
return this.store[docRef][idxVal]
} else {
return null
}
}

lunr.TokenMetadataStore.prototype.getAll = function(docRef, idxValArray) {
var out = []
idxValArray.forEach(function(idxVal) {
var tokens = this.get(docRef, idxVal)
if(tokens) {
tokens.forEach(function(token) {
if(token) {
out.push(token.toJSON())
}
})
}
}, this)
return out
}

lunr.TokenMetadataStore.prototype.remove = function(docRef) {
delete this.store[docRef]
}

lunr.TokenMetadataStore.prototype.toJSON = function() {
return {
store: this.store
}
}
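
A sketch of the store used on its own, outside an index; the docRef and token values are illustrative:

var store = new lunr.TokenMetadataStore

store.add('doc1', new lunr.Token({ raw: 'test', indexedAs: 'test', startPos: 5, field: 'body' }))
store.add('doc1', new lunr.Token({ raw: 'test', indexedAs: 'test', startPos: 10, field: 'body' }))

store.get('doc1', 'test')      // => both Tokens, in insertion order
store.getAll('doc1', ['test']) // => the same Tokens as plain objects, via toJSON()
store.get('doc1', 'missing')   // => null

store.remove('doc1')
store.get('doc1', 'test')      // => null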
36 changes: 29 additions & 7 deletions lib/tokenizer.js
@@ -12,10 +12,21 @@
* @returns {Array}
*/
lunr.tokenizer = function (obj) {
if (!arguments.length || obj == null || obj == undefined) return []
if (Array.isArray(obj)) return obj.map(function (t) { return t.toLowerCase() })
if (!arguments.length || obj == null || obj == undefined) return (new lunr.TokenList)

var str = obj.toString().replace(/^\s+/, '')
if (Array.isArray(obj)){
return new lunr.TokenList(obj.map(function (t) {
return new lunr.Token({raw: t.toLowerCase()})
}))
}

var str = obj.toString(),
preStrLength = str.length

// Trim leading whitespace
str = str.replace(/^\s+/, '')

var trimCount = preStrLength - str.length

for (var i = str.length - 1; i >= 0; i--) {
if (/\S/.test(str.charAt(i))) {
@@ -24,9 +35,20 @@ lunr.tokenizer = function (obj) {
}
}

return str
.split(/\s+/)
.map(function (token) {
return token.toLowerCase()
var startPos = trimCount,
tokens = new lunr.TokenList

str.split(/\s/).forEach(function (_token, index) {
if(index){ startPos += 1 }

// I think lowercase should be a fn in the pipeline, not in the tokenizer
var trimmedToken = _token.replace(/^\s+/, '').toLowerCase()
if(trimmedToken !== "") {
tokens.push(new lunr.Token({raw: trimmedToken, startPos: startPos}))
}

startPos += _token.length
})

return tokens
}
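
The position accounting, hand-checked against the splitting logic above:

var tokens = lunr.tokenizer('  Foo  bar')

tokens.get(0) // => Token { raw: 'foo', startPos: 2 }
tokens.get(1) // => Token { raw: 'bar', startPos: 7 }

// leading whitespace is counted via trimCount, and each empty string
// produced by consecutive separators still advances startPos by one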
4 changes: 4 additions & 0 deletions test/index.html
@@ -25,6 +25,9 @@
<script src="/lib/stop_word_filter.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/trimmer.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_store.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_metadata_store.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_list.js" type="text/javascript" charset="utf-8"></script>

<!-- Fixtures -->
<script src="/test/fixtures/stemming_vocab.json"></script>
@@ -44,6 +47,7 @@
<script src="/test/lunr_test.js"></script>
<script src="/test/token_store_test.js"></script>
<script src="/test/trimmer_test.js"></script>
<script src="/test/token_metadata_store_test.js"></script>

</head>
<body>
2 changes: 1 addition & 1 deletion test/pipeline_test.js
@@ -107,7 +107,7 @@ test("run should return the result of running the entire pipeline on each elemen
var pipeline = new lunr.Pipeline,
fn1 = function (t1) { return t1.toUpperCase() }
pipeline.add(fn1)
deepEqual(pipeline.run(['a']), ['A'])
deepEqual(pipeline.run(['a']).indexTokens(), ['A'])
})

test("run should filter out any undefined values at each stage in the pipeline", function () {
3 changes: 2 additions & 1 deletion test/serialisation_test.js
@@ -18,7 +18,7 @@ module('serialisation', {

test('dumping and loading an index', function () {
var idx = new lunr.Index

idx.useTokenMetadata = false
idx.field('title', { boost: 10 })
idx.field('body')

@@ -35,6 +35,7 @@ test('dumping and loading an index with a populated pipeline', function () {
this.field('title', { boost: 10 })
this.field('body')
})
idx.useTokenMetadata = false

this.corpus.forEach(function (doc) { idx.add(doc) })

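
Presumably the round-trip tests opt out because token metadata is not yet part of the serialised form; a guess at the intended switch:

var idx = new lunr.Index
idx.useTokenMetadata = false // Index#add will skip tokenMetadataStore.add()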
51 changes: 51 additions & 0 deletions test/token_metadata_store_test.js
@@ -0,0 +1,51 @@
module('lunr.TokenMetadataStore')

test('adding a document to the index', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'this is a test'}

idx.field('body')
idx.add(doc)

ok(!!idx.tokenMetadataStore.store[doc.id], "tokenMetadataStore has an entry for doc 1")
})

test('searching for a document', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'this is a test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
equal(results.length, 1, "There should be 1 search result")

var tokens = results[0].tokens
equal(tokens.length, 1, "There should be 1 lunr.Token in the result")
})

test('searching for a document with repeated tokens', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'is a test test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
var tokens = results[0].tokens

deepEqual(tokens, [{raw: 'test', startPos: 5, indexedAs: 'test', field:'body'}, {raw: 'test', startPos: 10, indexedAs: 'test', field:'body'}])
})

test('position works with whitespace', function () {
var idx = new lunr.Index,
doc = {id: 1, body: '   test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
var tokens = results[0].tokens

deepEqual(tokens, [{raw: 'test', startPos: 3, indexedAs: 'test', field:'body'}])
})