Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero-copy loading of precomputed dictionaries #65

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
.DS_Store
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ Typo.js is a JavaScript spellchecker that uses Hunspell-style dictionaries.
Usage
=====

Simple Loading
--------------

To use Typo in a Chrome extension, simply include the typo.js file in your extension's background page, and then initialize the dictionary like so:

```javascript
Expand All @@ -22,6 +25,33 @@ var Typo = require("typo-js");
var dictionary = new Typo([...]);
```


Faster Loading
--------------

If you care about memory or CPU usage, you should try this method.

The above methods load the dictionary from Hunspell-compatible `.dic` and `.aff` files. But if you are using node.js, or a bundler that supports `require(...)`, you can instead load dictionaries from fast, memory-efficient, zero-copy-ish files that are precomputed using a script.

To load en_US with the included precomputed dictionary files:

```javascript
var Typo = require("typo-js");
var dictionary = new Typo();
dictionary.loadPrecomputed([...]); // Supports most of the same settings as the constructor
```

Assuming you installed this as a node module, if you have some other set of `.aff` and `.dic` files, precompute the `.sst` and `.json` files used by the above technique by running:

`./node_modules/.bin/typo-precompute [en_US|other_code] [path/to/dictionaries]` using your terminal in your project's root folder

NOTE: The precompute script will require a lot of memory if processing a large dictionary.



Methods
-------

To check if a word is spelled correctly, do this:

```javascript
Expand All @@ -36,6 +66,9 @@ var array_of_suggestions = dictionary.suggest("mispeling");
// array_of_suggestions == ["misspelling", "dispelling", "misdealing", "misfiling", "misruling"]
```

Compatibility
-------------

Typo.js has full support for the following Hunspell affix flags:

* PFX
Expand Down
39 changes: 39 additions & 0 deletions bin/benchmark.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env node

'use strict';


const Typo = require('../src/typo');

/**
 * Calls `f` a number of times and logs the total wall-clock time taken.
 *
 * @param {string} name - Label printed in front of the measurement.
 * @param {Function} f - Function to benchmark; called with no arguments.
 * @param {number} [iters] - How many times to call `f`; any falsy value means 1.
 */
function time(name, f, iters) {
  const runs = iters || 1;

  const startedAt = Date.now();
  let remaining = runs;
  while (remaining-- > 0) {
    f();
  }
  const finishedAt = Date.now();

  // Millisecond difference reported in seconds, e.g. "0.042s".
  const seconds = (finishedAt - startedAt) / 1000;
  console.log(name, seconds + 's');
}

// --- Part 1: how long does each loading strategy take? ---
console.log('Dictionary load time');

const dict = new Typo();
time('- regular', () => dict.load('en_US'));

const preDict = new Typo();
time('- precomputed', () => preDict.loadPrecomputed('en_US'));

// --- Part 2: per-word lookup speed for both dictionaries ---
console.log('\n\ndict.check() speed');

const iterations = 1000000;
const sampleWords = ['hypersensitiveness', "Abbott's", '9th', 'aaraara', "didn't", "he're"];

for (const word of sampleWords) {
  // Same word checked against the regular and the precomputed dictionary.
  time('- ' + word + ' (reg)', () => dict.check(word), iterations);
  time('- ' + word + ' (pre)', () => preDict.check(word), iterations);
}

91 changes: 91 additions & 0 deletions bin/precompute-dic.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env node
/*
The Hunspell '.dic' dictionaries that typo.js uses take too long to load for most web applications, and the default JavaScript-object-based word table is very memory-inefficient.

This is a node.js script for taking an existing dictionary, loading it the regular way and then outputting precomputed '.sst' (word table) and '.json' (metadata) files into [path_to_dictionaries_folder]/[local_code]/.

Usage:
- call as ./bin/precompute-dic.js [local_code] [path_to_dictionaries_folder]

- default usage is equivalent to "./bin/precompute-dic.js en_US ./src/dictionaries"
*/

'use strict';

const fs = require('fs');
const path = require('path');
const sstab = require('sstab');

// Dictionary code to precompute: first CLI argument, defaulting to 'en_US'.
var DICT = process.argv[2] || 'en_US';
// Absolute path of the dictionaries folder: second CLI argument, defaulting to ../src/dictionaries relative to this script.
var FOLDER = path.resolve(process.argv[3] || (__dirname + '/../src/dictionaries/'));
// NOTE(review): BINSIZE is not referenced anywhere else in the visible part of this script — confirm whether it is still needed.
var BINSIZE = 12; /**< Number of strings per sst bin */

// Print an empty line before the progress output.
console.log();


// NOTE(review): Node's require() only uses its first argument, so the trailing
// `null, null, { dictionaryPath: FOLDER }` are ignored here and FOLDER never reaches Typo.
// Presumably these were meant for the Typo constructor or for load() — confirm against src/typo.
var Typo = require('../src/typo', null, null, { dictionaryPath: FOLDER });
var dict = new Typo();
// Load the dictionary the regular (non-precomputed) way.
dict.load(DICT)

console.log('# Expanded Words:', Object.keys(dict.dictionaryTable).length)
console.log('Flags:', dict.flags)

// Word table of the loaded dictionary; iterated below as word -> flags.
var dt = dict.dictionaryTable;


// Maps every word of the dictionary table to its remaining flags, joined into a single string.
const obj = {};

console.log('1/3 Compressing values');
for (const word of Object.keys(dt)) {
  // A falsy table entry is treated as an empty flag list.
  const flags = dt[word] || [];

  // Sanity check that the charset was loaded correctly
  if (word.slice(2) === 'rich' && word[0] === 'Z') {
    console.log(word.charCodeAt(1), '==', 'Zürich'.charCodeAt(1)); // Should be 'Zürich'
  }

  // Remove rules that have already been applied (prefix/suffix rules).
  // filter() builds a new array, so the loaded dictionary's own table is left untouched.
  const remaining = flags.filter((flag) => {
    const rule = dict.rules[flag];
    return !(rule && (rule.type === 'PFX' || rule.type === 'SFX'));
  });

  obj[word] = remaining.join('');
}


console.log('2/3 Creating table');

// Serialize the word -> flags map into a single buffer.
const buf = sstab.build(obj);


console.log('3/3 Saving');

// Both output files are written to <FOLDER>/<DICT>/.
const outDir = path.join(FOLDER, DICT);

// Generate metadata file
fs.writeFileSync(path.join(outDir, `${DICT}.json`), JSON.stringify({
  compoundRuleCodes: dict.compoundRuleCodes,
  dictionary: dict.dictionary,
  rules: dict.rules,
  // Regex needs to be explicitly stringified for JSON serialization
  compoundRules: dict.compoundRules.map((r) => r.toString()),
  replacementTable: dict.replacementTable,
  flags: dict.flags,
  loaded: dict.loaded
}));

// Save the precomputed word table next to the metadata file.
fs.writeFileSync(path.join(outDir, `${DICT}.sst`), buf);

console.log('Done!');


20 changes: 20 additions & 0 deletions examples/node/precomputed.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Example: spellchecking with a precomputed dictionary.
 *
 * Before running, ensure that you have done
 *   $ npm install typo-js
 */

const Typo = require("typo-js");

// Create an empty instance, then load the precomputed en_US dictionary into it.
const dictionary = new Typo();
dictionary.loadPrecomputed("en_US");

// Check one misspelled and one correctly spelled word.
const misspelledResult = dictionary.check("mispelled");
console.log( "Is 'mispelled' spelled correctly? " + misspelledResult );

const correctResult = dictionary.check("misspelled");
console.log( "Is 'misspelled' spelled correctly? " + correctResult );

// Ask for suggestions for a misspelled word.
const suggestions = dictionary.suggest("mispeling");
console.log( "Spelling suggestions for 'mispeling': " + suggestions.join( ', ' ) );
2 changes: 2 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

// Package entry point: re-exports whatever ./src/typo exports.
module.exports = require('./src/typo');