Creates a TokenMetadataStore to return startPosition of tokens in results #79

Open · wants to merge 4 commits into master
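
For orientation, a minimal sketch of the behaviour this change is after, pieced together from the tests included below; the document id, field name, and offsets are illustrative and assume the default pipeline (trimmer, stop word filter, stemmer):

var idx = new lunr.Index
idx.field('body')
idx.add({ id: 1, body: 'a quick test' })

var results = idx.search('test')
// results[0].ref    => 1
// results[0].score  => vector similarity, as before
// results[0].tokens => [{ raw: 'test', startPos: 8, indexedAs: 'test', field: 'body' }]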
3 changes: 3 additions & 0 deletions Makefile
@@ -12,6 +12,9 @@ SRC = lib/lunr.js \
lib/stop_word_filter.js \
lib/trimmer.js \
lib/token_store.js \
lib/token_metadata_store.js \
lib/token.js \
lib/token_list.js \

YEAR = $(shell date +%Y)
VERSION = $(shell cat VERSION)
20 changes: 17 additions & 3 deletions lib/index.js
@@ -18,9 +18,12 @@ lunr.Index = function () {
this.tokenStore = new lunr.TokenStore
this.corpusTokens = new lunr.SortedSet
this.eventEmitter = new lunr.EventEmitter
this.tokenMetadataStore = new lunr.TokenMetadataStore

this._idfCache = {}

this.useTokenMetadata = true

this.on('add', 'remove', 'update', (function () {
this._idfCache = {}
}).bind(this))
@@ -145,7 +148,16 @@ lunr.Index.prototype.add = function (doc, emitEvent) {
emitEvent = emitEvent === undefined ? true : emitEvent

this._fields.forEach(function (field) {
var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))
var tokenList = this.pipeline.run(lunr.tokenizer(doc[field.name])),
fieldTokens = []

tokenList.toArray().forEach(function(token) {
token.field = field.name
if(this.useTokenMetadata){
this.tokenMetadataStore.add(docRef, token)
}
fieldTokens.push(token.indexedAs)
}, this)

docTokens[field.name] = fieldTokens
lunr.SortedSet.prototype.add.apply(allDocumentTokens, fieldTokens)
@@ -199,6 +211,7 @@ lunr.Index.prototype.remove = function (doc, emitEvent) {
var docTokens = this.documentStore.get(docRef)

this.documentStore.remove(docRef)
this.tokenMetadataStore.remove(docRef)

docTokens.forEach(function (token) {
this.tokenStore.remove(token, docRef)
@@ -283,11 +296,12 @@ lunr.Index.prototype.idf = function (term) {
* @memberOf Index
*/
lunr.Index.prototype.search = function (query) {
var queryTokens = this.pipeline.run(lunr.tokenizer(query)),
var queryTokenList = this.pipeline.run(lunr.tokenizer(query)),
queryVector = new lunr.Vector,
documentSets = [],
fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

var queryTokens = queryTokenList.indexTokens()
var hasSomeToken = queryTokens.some(function (token) {
return this.tokenStore.has(token)
}, this)
@@ -333,7 +347,7 @@ lunr.Index.prototype.search = function (query) {

return documentSet
.map(function (ref) {
return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
return { ref: ref, score: queryVector.similarity(this.documentVector(ref)), tokens: this.tokenMetadataStore.getAll(ref, queryTokens) }
}, this)
.sort(function (a, b) {
return b.score - a.score
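
A hedged sketch of the document lifecycle these index.js changes set up; "doc1" is illustrative and the stemmed forms assume the default pipeline:

var idx = new lunr.Index
idx.field('body')

// add(): each token that survives the pipeline is tagged with its field
// and, while useTokenMetadata is true (the default), recorded per docRef
idx.add({ id: 'doc1', body: 'testing tokens' })
idx.tokenMetadataStore.store['doc1'] // => { test: [Token], token: [Token] }

// remove(): the metadata is dropped together with the document's index entries
idx.remove({ id: 'doc1' })
idx.tokenMetadataStore.store['doc1'] // => undefined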
33 changes: 25 additions & 8 deletions lib/pipeline.js
@@ -172,21 +172,38 @@ lunr.Pipeline.prototype.remove = function (fn) {
* @returns {Array}
* @memberOf Pipeline
*/
lunr.Pipeline.prototype.run = function (tokens) {
var out = [],
tokenLength = tokens.length,
lunr.Pipeline.prototype.run = function (_tokens) {
var out = new lunr.TokenList,
tokens = new lunr.TokenList,
stackLength = this._stack.length

// If tokenizer didn't give us Tokens, convert them
if(_tokens instanceof lunr.TokenList) {
tokens = _tokens
}
else {
tokens.setList(_tokens.map(function(token) {
return new lunr.Token({ raw: token })
}))
}

var rawTokens = tokens.rawTokens(),
tokenLength = tokens.length

for (var i = 0; i < tokenLength; i++) {
var token = tokens[i]
var token = tokens.get(i)
var indexToken = token.raw

for (var j = 0; j < stackLength; j++) {
token = this._stack[j](token, i, tokens)
if (token === void 0) break
indexToken = this._stack[j](indexToken, i, rawTokens)
if (indexToken === void 0) break
};

if (token !== void 0) out.push(token)
};
if (indexToken !== void 0) {
token.indexedAs = indexToken
out.push(token)
}
}

return out
}
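
To illustrate the new run() contract (the uppercasing function mirrors the updated pipeline test further down):

var pipeline = new lunr.Pipeline
pipeline.add(function (t) { return t.toUpperCase() })

// plain string input is wrapped into lunr.Token objects, so existing
// pipeline functions still receive and return raw strings
var out = pipeline.run(['a', 'b'])
out instanceof lunr.TokenList // => true
out.indexTokens()             // => ['A', 'B'] (post-pipeline values)
out.rawTokens()               // => ['a', 'b'] (original values preserved)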
22 changes: 22 additions & 0 deletions lib/token.js
@@ -0,0 +1,22 @@
lunr.Token = function(args) {
// The indexed value of the token
this.indexedAs = args.indexedAs

// Start position in the document
this.startPos = args.startPos

// Name of the field in which this token appears
this.field = args.field

// The raw value of the token in the document
this.raw = args.raw
}

lunr.Token.prototype.toJSON = function() {
return {
indexedAs: this.indexedAs,
startPos: this.startPos,
field: this.field,
raw: this.raw
}
}
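
A small usage sketch; all field values here are illustrative:

var token = new lunr.Token({
  raw: 'testing',    // the text as it appears in the document
  indexedAs: 'test', // the value after the pipeline has run (e.g. stemming)
  startPos: 0,       // character offset within the field
  field: 'body'
})

JSON.stringify(token) // uses toJSON(): {"indexedAs":"test","startPos":0,"field":"body","raw":"testing"}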
41 changes: 41 additions & 0 deletions lib/token_list.js
@@ -0,0 +1,41 @@
lunr.TokenList = function(elms) {
this.elements = []
this.length = 0

if(elms) {
this.setList(elms)
}
}

lunr.TokenList.prototype.push = function(token) {
if(!(token instanceof lunr.Token)){
throw new Error ('Cannot add type ' + typeof(token) + " to a token list, must be lunr.Token")
}
this.elements.push(token)
this.length++
}

lunr.TokenList.prototype.get = function(index) {
return this.elements[index]
}

lunr.TokenList.prototype.setList = function(elements) {
this.elements = elements
this.length = this.elements.length
}

lunr.TokenList.prototype.toArray = function() {
return Array.prototype.slice.call(this.elements, 0)
}

lunr.TokenList.prototype.indexTokens = function() {
return this.elements.map(function(token) {
return token.indexedAs
})
}

lunr.TokenList.prototype.rawTokens = function() {
return this.elements.map(function(token) {
return token.raw
})
}
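
A sketch of the list in use; the tokens are illustrative:

var list = new lunr.TokenList([
  new lunr.Token({ raw: 'foxes', indexedAs: 'fox' }),
  new lunr.Token({ raw: 'jumped', indexedAs: 'jump' })
])

list.length        // => 2
list.get(1).raw    // => 'jumped'
list.indexTokens() // => ['fox', 'jump']
list.rawTokens()   // => ['foxes', 'jumped']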
51 changes: 51 additions & 0 deletions lib/token_metadata_store.js
@@ -0,0 +1,51 @@
lunr.TokenMetadataStore = function() {
this.store = {}
}

lunr.TokenMetadataStore.prototype.add = function(docRef, token) {
if(!(token instanceof lunr.Token)){
throw new Error ("Must add lunr.Token to TokenMetadataStore")
}

var idxVal = token.indexedAs

if(!idxVal) return

this.store[docRef] = this.store[docRef] || {}
this.store[docRef][idxVal] = this.store[docRef][idxVal] || []

this.store[docRef][idxVal].push(token);
}

lunr.TokenMetadataStore.prototype.get = function(docRef, idxVal) {
if(this.store[docRef] && this.store[docRef][idxVal]) {
return this.store[docRef][idxVal]
} else {
return null
}
}

lunr.TokenMetadataStore.prototype.getAll = function(docRef, idxValArray) {
var out = []
idxValArray.forEach(function(idxVal) {
var tokens = this.get(docRef, idxVal)
if(tokens) {
tokens.forEach(function(token) {
if(token) {
out.push(token.toJSON())
}
})
}
}, this)
return out
}

lunr.TokenMetadataStore.prototype.remove = function(docRef) {
delete this.store[docRef]
}

lunr.TokenMetadataStore.prototype.toJSON = function() {
return {
store: this.store
}
}
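
A sketch of the store used on its own, outside an index; the docRef and token values are illustrative:

var store = new lunr.TokenMetadataStore

store.add('doc1', new lunr.Token({ raw: 'test', indexedAs: 'test', startPos: 5, field: 'body' }))
store.add('doc1', new lunr.Token({ raw: 'test', indexedAs: 'test', startPos: 10, field: 'body' }))

store.get('doc1', 'test')      // => both Tokens, in insertion order
store.getAll('doc1', ['test']) // => the same Tokens as plain objects, via toJSON()
store.get('doc1', 'missing')   // => null

store.remove('doc1')
store.get('doc1', 'test')      // => null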
36 changes: 29 additions & 7 deletions lib/tokenizer.js
@@ -12,10 +12,21 @@
* @returns {Array}
*/
lunr.tokenizer = function (obj) {
if (!arguments.length || obj == null || obj == undefined) return []
if (Array.isArray(obj)) return obj.map(function (t) { return t.toLowerCase() })
if (!arguments.length || obj == null || obj == undefined) return (new lunr.TokenList)

var str = obj.toString().replace(/^\s+/, '')
if (Array.isArray(obj)){
return new lunr.TokenList(obj.map(function (t) {
return new lunr.Token({raw: t.toLowerCase()})
}))
}

var str = obj.toString(),
preStrLength = str.length

// Trim leading whitespace
str = str.replace(/^\s+/, '')

var trimCount = preStrLength - str.length

for (var i = str.length - 1; i >= 0; i--) {
if (/\S/.test(str.charAt(i))) {
@@ -24,9 +35,20 @@ lunr.tokenizer = function (obj) {
}
}

return str
.split(/\s+/)
.map(function (token) {
return token.toLowerCase()
var startPos = trimCount,
tokens = new lunr.TokenList

str.split(/\s/).forEach(function (_token, index) {
if(index){ startPos += 1 }

// I think lowercase should be a fn in the pipeline, not in the tokenizer
var trimmedToken = _token.replace(/^\s+/, '').toLowerCase()
if(trimmedToken !== "") {
tokens.push(new lunr.Token({raw: trimmedToken, startPos: startPos}))
}

startPos += _token.length
})

return tokens
}
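
The position accounting, hand-checked against the splitting logic above:

var tokens = lunr.tokenizer('  Foo  bar')

tokens.get(0) // => Token { raw: 'foo', startPos: 2 }
tokens.get(1) // => Token { raw: 'bar', startPos: 7 }

// leading whitespace is counted via trimCount, and each empty string
// produced by consecutive separators still advances startPos by one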
4 changes: 4 additions & 0 deletions test/index.html
@@ -25,6 +25,9 @@
<script src="/lib/stop_word_filter.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/trimmer.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_store.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_metadata_store.js" type="text/javascript" charset="utf-8"></script>
<script src="/lib/token_list.js" type="text/javascript" charset="utf-8"></script>

<!-- Fixtures -->
<script src="/test/fixtures/stemming_vocab.json"></script>
@@ -44,6 +47,7 @@
<script src="/test/lunr_test.js"></script>
<script src="/test/token_store_test.js"></script>
<script src="/test/trimmer_test.js"></script>
<script src="/test/token_metadata_store_test.js"></script>

</head>
<body>
2 changes: 1 addition & 1 deletion test/pipeline_test.js
@@ -107,7 +107,7 @@ test("run should return the result of running the entire pipeline on each elemen
var pipeline = new lunr.Pipeline,
fn1 = function (t1) { return t1.toUpperCase() }
pipeline.add(fn1)
deepEqual(pipeline.run(['a']), ['A'])
deepEqual(pipeline.run(['a']).indexTokens(), ['A'])
})

test("run should filter out any undefined values at each stage in the pipeline", function () {
3 changes: 2 additions & 1 deletion test/serialisation_test.js
@@ -18,7 +18,7 @@ module('serialisation', {

test('dumping and loading an index', function () {
var idx = new lunr.Index

idx.useTokenMetadata = false
idx.field('title', { boost: 10 })
idx.field('body')

@@ -35,6 +35,7 @@ test('dumping and loading an index with a populated pipeline', function () {
this.field('title', { boost: 10 })
this.field('body')
})
idx.useTokenMetadata = false

this.corpus.forEach(function (doc) { idx.add(doc) })

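
Presumably the round-trip tests opt out because token metadata is not yet part of the serialised form; a guess at the intended switch:

var idx = new lunr.Index
idx.useTokenMetadata = false // Index#add will skip tokenMetadataStore.add()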
51 changes: 51 additions & 0 deletions test/token_metadata_store_test.js
@@ -0,0 +1,51 @@
module('lunr.TokenMetadataStore')

test('adding a document to the index', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'this is a test'}

idx.field('body')
idx.add(doc)

ok(!!idx.tokenMetadataStore.store[doc.id], "tokenMetadataStore has an entry for doc 1")
})

test('searching for a document', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'this is a test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
equal(results.length, 1, "There should be 1 search result")

var tokens = results[0].tokens
equal(tokens.length, 1, "There should be 1 lunr.Token in the result")
})

test('searching for a document with repeated tokens', function () {
var idx = new lunr.Index,
doc = {id: 1, body: 'is a test test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
var tokens = results[0].tokens

deepEqual(tokens, [{raw: 'test', startPos: 5, indexedAs: 'test', field:'body'}, {raw: 'test', startPos: 10, indexedAs: 'test', field:'body'}])
})

test('position works with whitespace', function () {
var idx = new lunr.Index,
doc = {id: 1, body: '   test'}

idx.field('body')
idx.add(doc)

var results = idx.search("test")
var tokens = results[0].tokens

deepEqual(tokens, [{raw: 'test', startPos: 3, indexedAs: 'test', field:'body'}])
})