Skip to content

Commit a013868

Browse files
committed
Soundex, bugfixes, documentation, enableSuggestions moved to opts
1 parent a41c7f5 commit a013868

File tree

7 files changed

+163
-46
lines changed

7 files changed

+163
-46
lines changed

README.md

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,28 @@
22

33
[![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts)
44

5-
Fast and extendible Node.js/Javascript full text search engine.
5+
Fast and extendible pure JavaScript full text search engine.
66

77
## Features
88

99
* Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset.
1010
* In-memory operation
1111
* Few external dependencies
12-
* Natural language search
12+
* Natural language search
1313
* Partial matching
1414
* Expression correction / suggestions
1515
* Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.)
16+
* Search modifiers (+ require, - exclude, "searchword" precise match - bypasses word processors)
1617
* Field preprocessors
1718
* HTML-Stripper
1819
* Word preprocessors
19-
* Swedish stemmer with stemmer stop words
20-
* Stop words
21-
* Wordforms
22-
* Stripper for multiple characters
20+
* [Stemmers](https://en.wikipedia.org/wiki/Stemming)
21+
* Swedish
22+
* English
23+
* [Stop words](https://en.wikipedia.org/wiki/Stop_words)
24+
* Word forms
25+
* [Soundex](https://en.wikipedia.org/wiki/Soundex)
26+
* Stripper for repeated characters
2327
* Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly.
2428

2529

@@ -300,13 +304,19 @@ An optional feature of the stemmers is to supply a list of words that you don't
300304

301305
Currently there are two stemmers available: Swedish, through a custom version of the Snowball algorithm, and English, through the Porter algorithm.
302306

303-
Example setting up thinker with standard ranker and english stemming
307+
Example setting up thinker with standard ranker, english stemming and some stemmer stopwords.
304308

305309
```javascript
306310
var
307311
thinker = Thinker(),
308312
ranker = Thinker.rankers.standard(),
309-
stemmer = Thinker.processors.stemmers.english();
313+
stemmer = Thinker.processors.stemmers.english({
314+
"stemmer": true,
315+
"stemming": true,
316+
"dontstemthiseither": true,
317+
"leonardo": true,
318+
"anders": true
319+
});
310320

311321
thinker.addWordProcessor(stemmer);
312322

@@ -322,9 +332,8 @@ var
322332
thinker = Thinker(),
323333
ranker = Thinker.rankers.standard(),
324334
stemmer = Thinker.processors.stemmers.swedish({
325-
"stemmer": true,
326-
"stemming": true,
327-
"dontstemthiseither": true,
335+
"berta": true,
336+
"jonas": true,
328337
"leonardo": true,
329338
"anders": true
330339
});
@@ -334,6 +343,23 @@ thinker.addWordProcessor(stemmer);
334343
thinker.ranker = ranker;
335344
```
336345

346+
#### Soundex
347+
348+
Soundex preprocesses words in such a way that words that sound alike match each other.
349+
350+
Example setting up thinker with Soundex processing.
351+
352+
```javascript
353+
var
354+
thinker = Thinker(),
355+
ranker = Thinker.rankers.standard(),
356+
soundex = Thinker.processors.soundex();
357+
358+
thinker.addWordProcessor(soundex);
359+
360+
thinker.ranker = ranker;
361+
```
362+
337363

338364
## Dependencies
339365

@@ -343,6 +369,8 @@ Note: Dependencies are installed automatically by npm
343369

344370
[stemmer](https://github.com/wooorm/stemmer) (https://github.com/wooorm/stemmer)
345371

372+
[node-soundex](https://github.com/LouisT/node-soundex) (https://github.com/LouisT/node-soundex)
373+
346374

347375
## Development dependencies
348376

lib/Thinker.js

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,6 @@ THE SOFTWARE.
2222
2323
*/
2424

25-
/* ToDo:
26-
27-
* Vikta titel efter hur mycket plats sökorden tar i titeln
28-
* Missingspacesnurra
29-
30-
*/
31-
3225
var Index = require('./index.js'),
3326
processors = require('./processors.js'),
3427
rankers = require('./rankers.js');
@@ -118,9 +111,6 @@ function Thinker (opts) {
118111
return new Thinker(opts);
119112
}
120113

121-
122-
// Can be set afterwards
123-
self.enableSuggestions = false;
124114
self.ranker = function() {};
125115

126116
// All these options must be set before indexing and
@@ -134,7 +124,8 @@ function Thinker (opts) {
134124
maxWordLen: 32,
135125
wordProcessors: [],
136126
fieldProcessors: [],
137-
suggestionMinWordCount: 6
127+
suggestionMinWordCount: 6,
128+
enableSuggestions: false
138129
}, opts );
139130

140131
// Index backend
@@ -170,9 +161,9 @@ Thinker.prototype.feed = function (texts, opts) {
170161
for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) {
171162
for (j = 0; j < (word.original.length - i) + 1; j++) {
172163
// Do not input partial if equals processed
173-
//if( word.original.substr(j,i) !== word.processed ) {
164+
if( word.original.substr(j,i) !== word.processed ) {
174165
self.index.populatePartial(word.original.substr(j, i), wIndex);
175-
//}
166+
}
176167
}
177168
}
178169
}
@@ -256,16 +247,16 @@ Thinker.prototype.find = function (string) {
256247
continue;
257248
}
258249

259-
//
250+
//
260251
queryResult = self.index.query(word, exact);
261252

262-
//
253+
// Enable suggestions if self.options.enableSuggestions is true
263254
suggestion = undefined;
264-
if (!queryResult.direct.length && self.enableSuggestions) {
255+
if (!queryResult.direct.length && self.options.enableSuggestions) {
265256
suggestion = self.index.findClosestWord(word.original);
266257
}
267258

268-
//
259+
// Push this expression to result array
269260
resultSet.expressions.push({
270261
interpretation: exact ? word.original : word.processed,
271262
original: word.original,
@@ -281,6 +272,11 @@ Thinker.prototype.find = function (string) {
281272
time('rankTime')
282273
resultSet.documents = self.ranker(resultSet,self.index.getWordCount());
283274

275+
// Remove expression[m].hits from resultset, not needed anymore
276+
for (i = 0; i < resultSet.expressions.length; i++) {
277+
delete resultSet.expressions[i].hits;
278+
}
279+
284280
// Add timers to resultset
285281
resultSet.findTime = time('findTime');
286282
resultSet.rankTime = time('rankTime');

lib/index.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ function index(opts) {
173173
} else {
174174
direct = queryProcessed( location.processed );
175175
}
176-
partial = queryPartial( location.original ) || queryPartial( location.partial );
176+
partial = queryPartial( location.original ) || queryPartial( location.processed );
177177

178178
// Add object
179179
return {

lib/processors.js

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ THE SOFTWARE.
2424

2525
'use strict';
2626

27-
var porterStemmer = require('stemmer');
27+
var porterStemmer = require('stemmer'),
28+
Soundex = require('soundex');
2829

2930
function stopwords ( stopwords ) {
3031
var stopwords = stopwords || {};
@@ -262,17 +263,27 @@ function swedishStemmer ( stopwords ) {
262263
263264
}*/
264265

265-
function englishStemmer ( ) {
266+
function englishStemmer ( stopwords ) {
267+
var stopwords = stopwords || {};
266268
return function ( w ) {
269+
// Dont process stopwords
270+
if ( stopwords[w] === true ) return w;
267271
return porterStemmer( w );
268272
};
269273
};
270274

275+
function soundex ( ) {
276+
return function ( w ) {
277+
return Soundex( w );
278+
};
279+
};
280+
271281
module.exports = {
272282
stemmers: {
273283
swedish: swedishStemmer,
274284
english: englishStemmer
275285
},
286+
soundex: soundex,
276287
stopwords: stopwords,
277288
wordforms: wordforms,
278289
multiples: multiples,

lib/rankers.js

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ THE SOFTWARE.
2626

2727
/* Default ranker */
2828
function standard (options) {
29+
2930
// Defaults
3031
var defaultFieldOptions = {
3132
weight: 1,
@@ -68,6 +69,7 @@ function standard (options) {
6869

6970
j = 0;
7071
while ((word = resultSet.expressions[j++])) {
72+
7173
matches = [
7274
{
7375
flag: 1,
@@ -100,15 +102,16 @@ function standard (options) {
100102
// current field or fall back on the default settings.
101103
fieldOptions = options.fields[fieldIndex] || defaultFieldOptions;
102104

103-
//
105+
// Multiply match weight with field-specific weight
104106
weight = match.weight * fieldOptions.weight;
105107

106-
// Not sure what this is
108+
// For field with boostPercentage flag enabled - add extra weight the more of the field that is matched.
109+
// 1 + (noOfMatchedWords / totalWordsInField)
107110
if (fieldOptions.boostPercentage) {
108111
weight *= (1 + (matchCount / wordCount[documentId][fieldIndex - 1]));
109112
}
110113

111-
// Something explanatory
114+
// Add this fields calculated weight to the document total
112115
getDocument(documentId).weight += weight;
113116
getDocument(documentId).expressions[j - 1] = match.flag;
114117

@@ -123,8 +126,6 @@ function standard (options) {
123126
// Convert document results from object to array (to be sortable)
124127
documentResultsFinal = Object.keys(documentResults).map(function (key) { return documentResults[key]; });
125128

126-
// Remove unwanted documents
127-
128129
// Sort documents by total weight
129130
documentResultsFinal.sort(function(a, b) {
130131
return b.weight - a.weight
@@ -164,6 +165,7 @@ function standard (options) {
164165
if (!toss) {
165166
temp.push(documentResultsFinal[i]);
166167
}
168+
167169
}
168170

169171
resultSet = temp;

package.json

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "thinker-fts",
3-
"version": "1.0.7",
4-
"description": "Javascript/Node.js in-memory full text search engine.",
3+
"version": "1.0.8",
4+
"description": "Pure Javascript/Node.js in-memory full text search engine.",
55
"author": "Hexagon <github.com/hexagon>",
66
"contributors": [{
77
"name": "Pehr Boman",
@@ -22,11 +22,17 @@
2222
"thinker",
2323
"fts",
2424
"fulltext",
25-
"in-memory"
25+
"in-memory",
26+
"levenshtein",
27+
"soundex",
28+
"porter",
29+
"stemmer",
30+
"full text search"
2631
],
2732
"dependencies": {
2833
"fast-levenshtein": "*",
29-
"stemmer": "*"
34+
"stemmer": "*",
35+
"soundex": "*"
3036
},
3137
"devDependencies": {
3238
"mocha": "*",

0 commit comments

Comments
 (0)