From a050a1a43d1b4800a10f2fcdd73a4e9ccbd4f6ad Mon Sep 17 00:00:00 2001 From: Cris Stringfellow <22254235+crislin2046@users.noreply.github.com> Date: Wed, 22 Dec 2021 17:11:24 +0800 Subject: [PATCH] "Fixed an issue with fuzzy and i_url. need to improve snippets" --- archivist.js | 36 +- lib/fz.js | 3071 ++++++++++++++++++++++++++++++++++++++++++++++ libraryServer.js | 20 +- todo | 1 + 4 files changed, 3098 insertions(+), 30 deletions(-) create mode 100644 lib/fz.js diff --git a/archivist.js b/archivist.js index c94d24d..1695191 100644 --- a/archivist.js +++ b/archivist.js @@ -21,11 +21,12 @@ addDocumentToIndex as ndx, removeDocumentFromIndex, vacuumIndex - } from './lib/ndx.js'; + } from 'ndx'; import { query as NDXQuery } from 'ndx-query'; import { toSerializable, fromSerializable } from 'ndx-serializable'; //import { DocumentIndex } from 'ndx'; - import Fuzzy from 'fz-search'; + //import Fuzzy from 'fz-search'; + import * as _Fuzzy from './lib/fz.js'; import Nat from 'natural'; import args from './args.js'; @@ -41,6 +42,7 @@ // search related state: constants and variables // common + const Fuzzy = globalThis.FuzzySearch; const NDX_OLD = false; const USE_FLEX = true; const FTS_INDEX_DIR = args.fts_index_dir; @@ -50,10 +52,6 @@ NDX_ID_KEY ]); const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key); - const nextOffset = (query, doc, startAt = 0) => Nat.LevenshteinDistanceSearch( - query, - doc.slice(startAt) - ); let Id; // natural (NLP tools -- stemmers and tokenizers, etc) @@ -441,6 +439,7 @@ export default Archivist; doc.contentSignature = contentSignature; fuzzy.add(doc); State.Docs.set(url, doc); + console.log(doc,url); } DEBUG && console.log("NDX updated", doc.ndx_id); @@ -660,6 +659,7 @@ export default Archivist; const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString(); State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => { doc.contentSignature = getContentSig(doc); + console.log(doc.url, doc); return [doc.url, doc]; })); await 
Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc))); @@ -676,7 +676,7 @@ export default Archivist; function saveFuzzy(basePath) { const docs = [...State.Docs.values()] - .map(({url, title, content, id}) => ({url, title, content, id})); + .map(({i_url, url, title, content, id}) => ({i_url, url, title, content, id})); const path = getFuzzyPath(basePath); Fs.writeFileSync( path, @@ -862,18 +862,12 @@ export default Archivist; function findOffsets(query, doc, count) { let res = []; - let i = 0; - while(i < doc.length) { - const result = nextOffset(query, doc, i); - console.log(result, i); - i += result.offset + result.substring.length + SNIP_CONTEXT; + const result = Nat.LevenshteinDistanceSearch(query, doc); + + if ( result.distance/result.substring.length < 0.5 ) { res.push(result); } - res.sort(({distance:a}, {distance:b}) => a-b); - console.log({res}); - res = res.slice(0, count); - return res; } @@ -939,7 +933,15 @@ export default Archivist; const results = combineResults({flex, ndx, fuzz}); - return {query,results}; + const highlights = fuzzRaw.map(obj => ({ + id: obj.id, + url: fuzzy.highlight(obj.url), + title: fuzzy.highlight(State.Index.get(obj.id).title), + })); + const HL = new Map(); + highlights.forEach(hl => HL.set(hl.id, hl)); + + return {query,results, HL}; } function combineResults({flex,ndx,fuzz}) { diff --git a/lib/fz.js b/lib/fz.js new file mode 100644 index 0000000..7ff50ea --- /dev/null +++ b/lib/fz.js @@ -0,0 +1,3071 @@ +/** + * @license FuzzySearch.js + * Autocomplete suggestion engine using approximate string matching + * https://github.com/jeancroy/FuzzySearch + * + * Copyright (c) 2015, Jean Christophe Roy + * Licensed under The MIT License. 
+ * http://opensource.org/licenses/MIT + */ + +(function () { 'use strict'; + +/** + * @param options + * @constructor + */ +'use strict'; + +function FuzzySearch(options) { + + if (options === undefined) options = {}; + if (!(this instanceof FuzzySearch)) return new FuzzySearch(options); + FuzzySearch.setOptions(this, options, FuzzySearch.defaultOptions, _privates, true, this._optionsHook) + +} + +FuzzySearch.defaultOptions = +/** @lends {FuzzySearchOptions.prototype} */{ + + // + // Scoring, include in result + // + + minimum_match: 1.0, // Minimum score to consider two token are not unrelated + thresh_include: 2.0, // To be a candidate, score of item must be at least this + thresh_relative_to_best: 0.5, // and be at least this fraction of the best score + field_good_enough: 20, // If a field have this score, stop searching other fields. (field score is before item related bonus) + + // + // Scoring, bonus + // + + bonus_match_start: 0.5, // Additional value per character in common prefix + bonus_token_order: 2.0, // Value of two token properly ordered + bonus_position_decay: 0.7, // Exponential decay for position bonus (smaller : more importance to first item) + + score_per_token: true, // if true, split query&field in token, allow to match in different order + // if false, bypass at least half the computation cost, very fast + // also disable different token that score different field, because no more token!! + + score_test_fused: false, // Try one extra match where we disregard token separation. + // "oldman" match "old man" + + score_acronym: false, // jrrt match against John Ronald Reuel Tolkien + token_sep: " .,-:", + + // + // Output sort & transform + // + + score_round: 0.1, // Two item that have the same rounded score are sorted alphabetically + output_limit: 0, // Return up to N result, 0 to disable + + sorter: compareResults, // Function used to sort. 
See signature of Array.sort(sorter) + normalize: normalize, // Function used to transform string (lowercase, accents, etc) + filter: null, // Select elements to be searched. (done before each search) + + /**@type {string|function({SearchResult})}*/ + output_map: "item", // Transform the output, can be a function or a path string. + // output_map="root" return SearchResult object, needed to see the score + // output_map="root.item" return original object. + // output_map="root.item.somefield" output a field of original object. + // (root.) is optional. + // + // output_map=function(root){ return something(root.item) } + // ^this get original object and apply something() on it. + + join_str: ", ", //String used to join array fields + + // + // Tokens options + // + + token_query_min_length: 2, // Avoid processing very small words, include greater or equal, in query + token_field_min_length: 3, // include greater or equal, in item field + token_query_max_length: 64, // Shorten large token to give more even performance. + token_field_max_length: 64, // Shorten large token to give more even performance. + token_fused_max_length: 64, // Shorten large token to give more even performance. + + //Do not attempt to match token too different in size: n/m = len(field_tok)/len(query_tok) + token_min_rel_size: 0.6, // Field token should contain query token. Reject field token that are too small. + token_max_rel_size: 10, // Large field token tend to match against everything. Ensure query is long enough to be specific. + + + // + // Interactive - suggest as you type. + // Avoid doing search that will be discarded without being displayed + // This also help prevent lag/ temp freeze + // + + interactive_debounce: 150, // This is initial value. Will try to learn actual time cost. Set to 0 to disable. + interactive_mult: 1.2, // Overhead for variability and to allow other things to happens (like redraw, highlight ). 
+ interactive_burst: 3, // Allow short burst, prevent flicker due to debounce suppression of a callback + + // + // Data + // + + source: [], + keys: [], + lazy: false, // when true, any refresh happens only when a user make a search, option stay put until changed. + token_re: /\s+/g, //Separator string will be parsed to this re. + + identify_item: null, // How to uniquely identify an item when adding to the index. Defaults to null, meaning no duplicate detection. Must be a method that takes a single (source) argument. + + use_index_store: false, // Enable a time vs memory trade-off for faster search (but longer initial warm-up). + store_thresh: 0.7, // cutoff point relative to best, to graduate from store phase. + store_max_results: 1500 // Maximum number of result to graduate from store, to the full search quality algorithm + // Note that store only perform a crude search, ignoring some options, so the best result can be only "meh" here. + +}; + + +var _privates = +/** @lends {FuzzySearch.prototype} */{ + + keys: [], + tags: [], // alternative name for each key, support output alias and per key search + index: [], // source is processed using keys, then stored here + index_map: {}, // To manage update of record already in dataset + nb_indexed: 0, // To manage active count of index + store: {}, // Dictionary used for time VS memory trade off. (Optional) + + tags_re: null, + acro_re: null, + token_re: null, + + /**@type {FuzzySearchOptions}*/ + options: null, + dirty: false, // when true, schedule a source refresh using new or existing source & keys, used once then clear itself. + + //Information on last search + query: null, + results: [], + start_time: 0, + search_time: 0 + +}; + +/** + * Number of bit in a int. + * DEBUG-tip: setting this to zero will force "long string" algorithm for everything! 
+ * @const + */ +var INT_SIZE = 32; + +function FuzzySearchOptions(defaults, options) { + for (var key in defaults) { + if (defaults.hasOwnProperty(key)) { //fill self with value from either options or default + this[key] = (options.hasOwnProperty(key) && options[key] !== undefined ) ? options[key] : defaults[key]; + } + } +} + +FuzzySearchOptions.update = function (self, defaults, options) { + for (var key in options) { + if (options.hasOwnProperty(key) && defaults.hasOwnProperty(key)) { + //explicitly set a options to undefined => reset default, else get value + self[key] = (options[key] === undefined) ? defaults[key] : options[key]; + } + } +}; + +/** + * Set property of object, + * Restrict properties that can be set from a list of available defaults. + * + * @param {FuzzySearch} self + * @param {Object} options + * @param {Object} defaults + * @param {Object} privates + * @param {boolean} reset + * @param {function({Object})} hook + * + */ +FuzzySearch.setOptions = function (self, options, defaults, privates, reset, hook) { + + if (reset) { + extend(self, privates); + self.options = new FuzzySearchOptions(defaults, options); + } else { + FuzzySearchOptions.update(self.options, defaults, options); + } + + hook.call(self, options) +}; + +function extend(a, b) { + for (var key in b) if (b.hasOwnProperty(key)) a[key] = b[key]; +} + + +// +// - - - - - - - - - - - - +// SET & PARSE SETTINGS +// - - - - - - - - - - - - +// + +extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ { + + /** + * Allow to change options after the object has been created. + * If source is changed, new source is indexed. + * + * Optional reset allow to change any setting not in options to defaults. + * This is similar to creating new object, but using same pointer. 
+ * + * @param {Object} options + * @param {boolean=} reset + */ + + setOptions: function (options, reset) { + if (reset === undefined) reset = options.reset || false; + FuzzySearch.setOptions(this, options, FuzzySearch.defaultOptions, _privates, reset, this._optionsHook); + }, + + /** + * + * @param {Object} options + * @private + */ + + _optionsHook: function (options) { + + //Items of options have been copied into this.options + //We still test "option_name in option" to know if we have received something new + //This allow to support "shorthand" options and is used to refresh data. + + var self_options = this.options; + + //Output stage + if ("output_map" in options && typeof options.output_map === "string") { + if (self_options.output_map === "alias") self_options.output_map = this.aliasResult; + else self_options.output_map = removePrefix(self_options.output_map, ["root", "."]); + } + + this.source = self_options.source; + + // Input stage, work to allow different syntax for keys definition is done here. + var oKeys; + if (("keys" in options) && ( ( oKeys = options.keys) !== undefined)) { + + var key_type = Object.prototype.toString.call(oKeys); + var key_index, nb_keys; + + this.tags = null; + + if (key_type === "[object String]") { + this.keys = oKeys.length ? 
[oKeys] : []; + } + + else if (key_type === "[object Object]") { + + this.keys = []; + this.tags = []; //we don't know the "length" of dictionary + key_index = 0; + for (var tag in oKeys) { + if (oKeys.hasOwnProperty(tag)) { + this.tags[key_index] = tag; + this.keys[key_index] = oKeys[tag]; + key_index++; + } + } + + } + + else { + this.keys = oKeys; + } + + oKeys = this.keys; + nb_keys = oKeys.length; + for (key_index = -1; ++key_index < nb_keys;) { + oKeys[key_index] = removePrefix(oKeys[key_index], ["item", "."]) + } + + if (!this.tags) this.tags = oKeys; + this.tags_re = buildTagsRE(this.tags); + + } + + if (this.acro_re === null || "acronym_tok" in options) { + this.acro_re = buildAcronymRE(self_options.token_sep); + } + + if (this.token_re === null || "token_sep" in options) { + this.token_re = self_options.token_re = new RegExp("[" + re_escape(self_options.token_sep) + "]+", "g"); + } + + // Determine if we need to rebuild this.index from this.source + if (options.dirty || ("source" in options) || ("keys" in options) || ("use_index_store" in options)) { + if (self_options.lazy) this.dirty = true; // Schedule later. + else { + this._buildIndexFromSource(); + this.dirty = false; + } + } + + } + +}); + +/** + * Removes optional prefix of paths. + * for example "root.", "." + * + * @param {string} str - input + * @param {Array} prefixes to remove + * @returns {string} + */ + +function removePrefix(str, prefixes) { + var n = prefixes.length; + var offset = 0; + + for (var i = -1; ++i < n;) { + var p = prefixes[i], l = p.length; + if (str.substr(offset, l) === p) offset += l; + } + + return (offset > 0) ? 
str.substr(offset) : str; +} + +function buildTagsRE(tags) { + + var n = tags.length; + if (!n) return null; + + var tag_str = re_escape(tags[0]); + for (var i = 0; ++i < n;) { + tag_str += "|" + re_escape(tags[i]); + } + + return new RegExp("(?:^|\\s)\\s*(" + tag_str + "):\\s*", "g"); + +} + +function buildAcronymRE(sep) { + + var n = sep.length; + if (!n) return null; + var acro_str = re_escape(sep); + return new RegExp("(?:^|[" + acro_str + "])+([^" + acro_str + "])[^" + acro_str + "]*", "g"); + +} + +// Build regexp for tagged search +function re_escape(str) { + var re = /[\-\[\]\/\{}\(\)\*\+\?\.\\\^\$\|]/g; + return str.replace(re, "\\$&"); +} + +// +// - - - - - - - - - - - - +// OUTPUT OR POST PROCESS +// - - - - - - - - - - - - +// +'use strict'; + +extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ { + + /** + * Given a SearchResult object, recover the value of the best matching field. + * This is done on demand for display. + * + * @param {SearchResult} result + * @return {string} original field + */ + + getMatchingField: function (result) { + var f = FuzzySearch.generateFields(result.item, [this.keys[result.matchIndex]]); + return f[0][result.subIndex]; + }, + + /** + * Given a SearchResult object, generate a new object that follow alias structure + * @param {SearchResult} result + * @return {*} aliased result + */ + + aliasResult: function (result) { + + var options = this.options; + var f = FuzzySearch.generateFields(result.item, this.keys); + var out = {}, tags = this.tags, join_str = options.join_str; + + for (var i = -1, n = f.length; ++i < n;) { + out[tags[i]] = f[i].join(join_str) + } + + out._item = result.item; + out._score = result.score; + out._match = f[result.matchIndex][result.subIndex]; + + return out; + + } + +}); + + +// - - - - - - - - - - - - - - - - - - - - - - +// Output stage, prepare results for return +//- - - - - - - - - - - - - - - - - - - - - - + +/** + * Own version of Array.prototype.map() + * + * @param 
{Array} source + * @param transform callback + * @param {*=} context (*this* in called function) + * @param {number=} max_out + * @returns {Array} + */ + +FuzzySearch.map = function (source, transform, context, max_out) { + + var n = source.length; + if (max_out > 0 && max_out < n) n = max_out; + if (typeof transform !== "function") return source.slice(0, n); + + var out = new Array(n); + for (var i = -1; ++i < n;) { + out[i] = transform.call(context, source[i], i, source); + } + + return out; + +}; + +/** + * Take an array of objects, return an array containing a field of those object. + * + * test = [ {key:"A",value:10}, {key:"B",value:20} ] + * mapField(test,"value") = [10,20] + * + * @param source - array to process + * @param {string} path - key to address on each item OR function to apply + * @param {Number=} [max_out=source.length] - only process first items + * @returns {Array} + */ + +FuzzySearch.mapField = function (source, path, max_out) { + + var n = source.length; + if (max_out > 0 && max_out < n) n = max_out; + if (path === "") return source.slice(0, n); + + var out = new Array(n); + var obj, i; + + + if (path.indexOf(".") === -1) { + //fast case no inner loop + for (i = -1; ++i < n;) { + obj = source[i]; + if (path in obj) out[i] = obj[path]; + } + + } else { + + //general case + var parts = path.split("."); + var nb_level = parts.length; + + for (i = -1; ++i < n;) { + obj = source[i]; + + for (var level = -1; ++level < nb_level;) { + var key = parts[level]; + if (!(key in obj)) break; + obj = obj[key]; + } + + out[i] = obj; + } + + } + + return out; + +}; + +/** + * Filter array for item where item[field] >= atleast + * + * @param array + * @param field + * @param atleast + * @returns {Array} + */ + +FuzzySearch.filterGTE = function (array, field, atleast) { + var i = -1, j = -1; + var n = array.length; + var out = [], obj; + + while (++i < n) { + obj = array[i]; + if (obj[field] >= atleast) { + out[++j] = obj; + } + } + + return out; +}; + + +/** + 
* SearchResult constructor + * - Internal result list + * - Output of search when output_map="" + * + * @param {*} item + * @param {Array} fields + * @param {number} item_score + * @param {number} matched_field_index + * @param {number} matched_field_sub + * @param {(string|number)} sortkey + * @constructor + */ + +function SearchResult(item, fields, item_score, matched_field_index, matched_field_sub, sortkey) { + this.item = item; + this.fields = fields; + this.score = item_score; + this.matchIndex = matched_field_index; + this.subIndex = matched_field_sub; + this.sortKey = sortkey; +} + + +/** + * Sort function + * first by decreasing order of score, then alphabetical order of sortkey. + * + * @param {SearchResult} a + * @param {SearchResult} b + * @returns {number} - ">0" if b before a, "<0" if b after a. + */ +function compareResults(a, b) { + var d = b.score - a.score; + if (d !== 0) return d; + var ak = a.sortKey, bk = b.sortKey; + return ak > bk ? 1 : ( ak < bk ? -1 : 0); +} + +// +// - - - - - - - - - - - - +// Prepare Query +// - - - - - - - - - - - - +// + +extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ { + + + /** + * Input: a user search string + * Output a query object + * + * Perform a few transformation to allw faster searching. + * String is set to lowercase, some accents removed, split into tokens. + * Token too small are filtered out, token too large are trimmed. + * Token are packed in group of 32 char, each token is processed to extract an alphabet map. + * + * If score_test_fused is enabled, we do an extra pass disregarding tokens. + * IF score_per_token is disabled this is the only pass we do. 
+ * + * @param query_string + * @returns {Query} + * @private + */ + + _prepQuery: function (query_string) { + + var options = this.options; + var opt_tok = options.score_per_token; + var opt_fuse = options.score_test_fused; + var opt_fuselen = options.token_fused_max_length; + var opt_qmin = options.token_field_min_length; + var opt_qmax = options.token_field_max_length; + + var tags = this.tags; + var tags_re = this.tags_re; + var nb_tags = tags.length; + var token_re = this.token_re; + + var norm, fused, fused_map, children, has_tags, group, words; + + if (opt_tok && nb_tags && tags_re) { + + var start = 0, end; + var q_index = 0; + var q_parts = new Array(nb_tags + 1); + + var match = tags_re.exec(query_string); + has_tags = (match !== null); + + while (match !== null) { + end = match.index; + q_parts[q_index] = query_string.substring(start, end); + start = end + match[0].length; + q_index = tags.indexOf(match[1]) + 1; + match = tags_re.exec(query_string); + } + + q_parts[q_index] = query_string.substring(start); + + children = []; + + for (var i = -1; ++i < nb_tags;) { + + var qp = q_parts[i + 1]; + if (!qp || !qp.length) continue; + + norm = options.normalize(qp); + fused = norm.substring(0, opt_fuselen); + fused_map = (opt_fuse || !opt_tok) ? FuzzySearch.alphabet(fused) : {}; + words = FuzzySearch.filterSize(norm.split(token_re), opt_qmin, opt_qmax); + group = FuzzySearch.pack_tokens(words); + + children[i] = new Query(norm, words, group, fused, fused_map, false, []); + } + + + norm = options.normalize(q_parts[0]); + words = FuzzySearch.filterSize(norm.split(token_re), opt_qmin, opt_qmax); + group = FuzzySearch.pack_tokens(words); + + } + + else { + norm = options.normalize(query_string); + words = FuzzySearch.filterSize(norm.split(token_re), opt_qmin, opt_qmax); + group = opt_tok ? FuzzySearch.pack_tokens(words) : []; + has_tags = false; + children = new Array(nb_tags); + } + + fused = norm.substring(0, opt_fuselen); + fused_map = (opt_fuse || !opt_tok) ? 
FuzzySearch.alphabet(fused) : {}; + + return new Query(norm, words, group, fused, fused_map, has_tags, children) + + } +}); + +// +// Query objects +// + +/** + * Hold a query + * + * @param {string} normalized + * @param {Array.} words + * @param {Array.} tokens_groups + * @param {string} fused_str + * @param {Object} fused_map + * @param {boolean} has_children + * @param {Array} children + * + * @constructor + */ + +function Query(normalized, words, tokens_groups, fused_str, fused_map, has_children, children) { + + this.normalized = normalized; + this.words = words; + this.tokens_groups = tokens_groups; + + this.fused_str = fused_str; + this.fused_map = fused_map; + this.fused_score = 0; + + this.has_children = has_children; + this.children = children; + +} + +// +// Query hold some memory to keep score of it's tokens. +// Used in search methods + +/** + * Loop tru each item score and reset to 0, apply to child query + */ +Query.prototype.resetItem = function () { + var groups = this.tokens_groups; + + for (var group_index = -1, nb_groups = groups.length; ++group_index < nb_groups;) { + var score_item = groups[group_index].score_item; + for (var i = -1, l = score_item.length; ++i < l;) score_item[i] = 0 + + } + + this.fused_score = 0; + + if (this.has_children) { + var children = this.children; + for (var child_index = -1, nb_child = children.length; ++child_index < nb_child;) { + var child = children[child_index]; + if (child) child.resetItem(); + } + } + +}; + +/** + * Sum each item score and add to child score + */ +Query.prototype.scoreItem = function () { + + var query_score = 0; + var groups = this.tokens_groups; + + for (var group_index = -1, nb_groups = groups.length; ++group_index < nb_groups;) { + var group_scores = groups[group_index].score_item; + for (var score_index = -1, nb_scores = group_scores.length; ++score_index < nb_scores;) { + query_score += group_scores[score_index] + } + } + + if (this.fused_score > query_score) query_score = 
this.fused_score; + + if (this.has_children) { + var children = this.children; + for (var child_index = -1, nb_child = children.length; ++child_index < nb_child;) { + var child = children[child_index]; + if (child) query_score += child.scoreItem(); + } + } + + return query_score; + +}; + + +/** + * Hold a group of token for parallel scoring + * + * @param {Array.} group_tokens + * @param {Object} group_map + * @param {number} gate + * @constructor + */ + +function PackInfo(group_tokens, group_map, gate) { + this.tokens = group_tokens; + this.map = group_map; + this.gate = gate; + + var t = group_tokens.length, i = -1; + var scores = new Array(t); + while (++i < t) scores[i] = 0; + + this.score_item = scores.slice(); + this.score_field = scores.slice(); + this.field_pos = scores; +} + +// +// - - - - - - - - - - - - - - - - - +// Prepare Token for search +// - - - - - - - - - - - - - - - - - +// a normal string can be view as an array of char. +// so we map ( position -> char). +// +// we reverse that relation to map +// char -> positions + +/** + * Record position of each character in a token. + * If token is small, position is recorded by position of a single bit in an int. + * If token is larger than INT_SIZE, position is recorder as array of number. 
+ * + * @param {string} token + * @returns {Object} key value map char->positions (as array of position or single int (can be seen as an array of bit) ) + */ +FuzzySearch.alphabet = function (token) { + var len = token.length; + if (len > INT_SIZE) return FuzzySearch.posVector(token); + else return FuzzySearch.bitVector(token, {}, 0); +}; + +/** + * Apply FuzzySearch.alphabet on multiple tokens + * + * @param {Array.} tokens + * @returns {Array.} + */ +FuzzySearch.mapAlphabet = function (tokens) { + var outlen = tokens.length; + var out = new Array(outlen), i = -1; + while (++i < outlen) { + var t = tokens[i]; + if (t.length > INT_SIZE) out[i] = FuzzySearch.posVector(t); + else out[i] = FuzzySearch.bitVector(t, {}, 0); + } + return out; +}; + +/** + * Record position of each char using a single bit + * + * @param {string} token + * @param {Object} map - Existing map to modify, can init with {} + * @param offset - used for packing multiple word in a single map, can init with 0 + * @returns {Object} Key value map char -> int + */ + +FuzzySearch.bitVector = function (token, map, offset) { + + var len = token.length; + var i = -1, c; + var b = offset; + + while (++i < len) { + c = token[i]; + if (c in map) map[c] |= (1 << b++); + else map[c] = (1 << b++); + } + + return map; + +}; + +/** + * Record position of each char in a token using an array + * Append Infinity as a stop marker for llcs_large + * + * map = posVector("position") + * map["p"] -> [0,Inf] + * map["o"] -> [1,6,Inf] + * + * @param {string} pattern + * @returns {Object} - key value map char->array of position (as number) + */ +FuzzySearch.posVector = function (pattern) { + + var map = {}, c; + + var m = pattern.length, i = -1; + while (++i < m) { + c = pattern[i]; + if (c in map) map[c].push(i); + else map[c] = [i]; + } + + for (c in map) { + if (map.hasOwnProperty(c)) { + map[c].push(Infinity); + } + } + + return map; + +}; + +/** + * Given a list of tokens, pack them into group of upto INT_SIZE(32) 
chars. + * If a single token is bigger than INT_SIZE create a groupe of a single item + * And use posVector instead of bitVector to prepare fallback algorithm. + * + * @param {Array.} tokens + * @returns {Array.} + */ +FuzzySearch.pack_tokens = function (tokens) { + + var token_index = -1; + var nb_tokens = tokens.length; + var large; + var groups = []; + + //For each group + while (token_index < nb_tokens) { + + var group_tokens = []; + var group_map = {}; + var offset = 0; + var gate = 0; + + //For each token in the group + while (++token_index < nb_tokens) { + + var token = tokens[token_index]; + var l = token.length; + + if (l >= INT_SIZE) { + + large = new PackInfo([token], + FuzzySearch.posVector(token), + 0xFFFFFFFF); + + break; + + } + else if (l + offset >= INT_SIZE) { + token_index--; + break; + } + else { + group_tokens.push(token); + FuzzySearch.bitVector(token, group_map, offset); + gate |= ( (1 << ( token.length - 1) ) - 1 ) << offset; + offset += l + } + + } + + if (group_tokens.length > 0) { + groups.push(new PackInfo(group_tokens, group_map, gate)); + } + + if (large) { + groups.push(large); + large = null; + } + + } + + return groups; + +}; + +// +//----------------------------- +// SCORING FUNCTIONS +// --------------------------- +// +'use strict'; + + +/** + * Score of "search a in b" using self as options. + * @param {string} a + * @param {string} b + */ +FuzzySearch.prototype.score = function (a, b) { + var aMap = FuzzySearch.alphabet(a); + return FuzzySearch.score_map(a, b, aMap, this.options); +}; + +// Adapted from paper: +// A fast and practical bit-vector algorithm for +// the Longest Common Subsequence problem +// Maxime Crochemore et Al. 
+// +// With modification from +// Bit-parallel LCS-length computation revisited (H Hyyrö, 2004) +// http://www.sis.uta.fi/~hh56766/pubs/awoca04.pdf +// + +/** + * Score of "search a in b" using precomputed alphabet map + * Main algorithm for single query token to score + * + * @param {string} a + * @param {string} b + * @param {Object} aMap - See FuzzySearch.alphabet + * @param {FuzzySearchOptions} options + */ +FuzzySearch.score_map = function (a, b, aMap, options) { + + var j, lcs_len; + var m = a.length; + var n = b.length; + var bonus_prefix = options.bonus_match_start; + + var k = m < n ? m : n; + if (k === 0) return 0; + + //normalize score against length of both inputs + var sz_score = (m + n) / ( 2.0 * m * n); + + //common prefix is part of lcs + var prefix = 0; + if (a === b) prefix = k; //speedup equality + else { + while ((a[prefix] === b[prefix]) && (++prefix < k)) { + } + } + + //shortest string consumed + if (prefix === k) { + lcs_len = prefix; + return sz_score * lcs_len * lcs_len + bonus_prefix * prefix; + } + + //alternative algorithm for large string + //need to keep this condition in sync with bitvector + if (m > INT_SIZE) { + lcs_len = FuzzySearch.llcs_large(a, b, aMap, prefix); + return sz_score * lcs_len * lcs_len + bonus_prefix * prefix; + } + + var mask = ( 1 << m ) - 1; + var S = mask, U, c; + + j = prefix - 1; + while (++j < n) { + c = b[j]; + if (c in aMap) { + // Hyyrö, 2004 S=V'=~V + U = S & aMap[c]; + S = (S + U) | (S - U); + } + } + + // Remove match already accounted in prefix region. 
+ mask &= ~( ( 1 << prefix ) - 1 ); + + // lcs_len is number of 0 in S (at position lower than m) + // inverse S, mask it, then do "popcount" operation on 32bit + S = ~S & mask; + + S = S - ((S >> 1) & 0x55555555); + S = (S & 0x33333333) + ((S >> 2) & 0x33333333); + lcs_len = (((S + (S >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + + lcs_len += prefix; + return sz_score * lcs_len * lcs_len + bonus_prefix * prefix; + +}; + +/** + * Call score_map on the first token. + * Filter size + * + * @param {PackInfo} packinfo + * @param {string} token + * @param {FuzzySearchOptions} options + * @return {Array.} score + */ +FuzzySearch.score_single = function (packinfo, token, options) { + var field_tok = packinfo.tokens[0]; + var m = field_tok.length; + var n = token.length; + if (n < options.token_min_rel_size * m || n > options.token_max_rel_size * m) return [0]; + return [FuzzySearch.score_map(field_tok, token, packinfo.map, options)]; +}; + +/** + * Score multiple query token against a single field token. 
+ * Apply above score function in parallel + * Computation is done as if everything was one big token, + * but ZM bit-vector modify boundary so score are independant + * + * @param {PackInfo} packinfo + * @param {string} field_token + * @param {FuzzySearchOptions} options + * @returns {Array.} scores + */ +FuzzySearch.score_pack = function (packinfo, field_token, options) { + + var packed_tokens = packinfo.tokens; + var nb_packed = packed_tokens.length; + + //single item token can contain either a single word "overflow" or a large word that need special handling + if (nb_packed == 1)return FuzzySearch.score_single(packinfo, field_token, options); + + var S = 0xFFFFFFFF, U, c; + var ZM = packinfo.gate | 0; + var aMap = packinfo.map; + + for (var j = -1, n = field_token.length; ++j < n;) { + c = field_token[j]; + if (c in aMap) { + U = S & aMap[c]; + S = ( (S & ZM) + (U & ZM) ) | (S - U); + } + } + + S = ~S; + + var bonus_prefix = options.bonus_match_start; + var min_rs = options.token_min_rel_size; + var max_rs = options.token_max_rel_size; + var scores = new Array(nb_packed); + var offset = 0; + + for (var k = -1; ++k < nb_packed;) { + + var query_tok = packed_tokens[k]; + var m = query_tok.length; + var lcs_len, prefix; + + if (n < min_rs * m || n > max_rs * m) { + scores[k] = 0; + offset += m; + continue; + } + + if (query_tok === field_token) + prefix = lcs_len = m; + + else { + var p = (m < n) ? m : n; + prefix = 0; + while ((query_tok[prefix] === field_token[prefix]) && (++prefix < p)) { + } + lcs_len = prefix; + var Sm = ( (S >>> offset) & ( (1 << m) - 1 ) ) >>> prefix; + while (Sm) { + Sm &= Sm - 1; + lcs_len++ + } + } + + offset += m; + var sz = (m + n) / ( 2.0 * m * n); + scores[k] = sz * lcs_len * lcs_len + bonus_prefix * prefix; + + } + + return scores; + +}; + + +// +// Compute LLCS, using vectors of position. 
//
// Based on:
// An input sensitive online algorithm for LCS computation
// Heikki Hyyro 2009
//
// We fill the dynamic programing table line per line
// but instead of storing the whole line we only store position where the line increase
// ( bitvector algorithm store increase yes/no as a bit) this time we will store sequence
//
//       s u r g e r y
//    g [0,0,0,1,1,1,1] : [3,4] (Add level 1)
//    s [1,1,1,1,1,1,1] : [0,1] (Make level 1 happens sooner)
//    u [1,2,2,2,2,2,2] : [0,2] (Add level 2, append to block of consecutive increase)
//    r [1,2,3,3,3,3,3] : [0,3] (Add level 3, append to block of consecutive increase)
//    v [1,2,3,3,3,3,3] : [0,3] (v not in surgery, copy)
//    e [1,2,3,3,4,4,4] : [0,3],[4,5] (Add level 4, create new block for it)
//    y [1,2,3,3,4,4,5] : [0,3],[4,5],[6,7] (Add level 5, create new block for it)
//
// There is 2 Basic operations:
// - Make a level-up happens sooner
// - Add an extra level up at the end. (this is where llcs increase !)
//
//  12345678901234567890  // Position (for this demo we start at 1)
//  ii------iii---i--i--  // Increase point of previous line
//  12222222345555666777  // Score previous line [1,3] [9,12] [15,16] [18,19]
//  ---m-m---------m---m  // Match of this line
//  12233333345555677778  // Score of this line [1,3] [4,5] [10,12] [15,17] [20,21]
//  ii-i-----ii---ii---i  // New increase point
//  12345678901234567890  // Position


/**
 * Length of the longest common subsequence, computed with lists of increase positions.
 *
 * The body only iterates over `b` and looks characters up in `aMap`; `a` itself is not read
 * (see the commented-out posVector line).
 *
 * @param {string} a - not read by the body
 * @param {string} b - string walked character by character, starting at index `prefix`
 * @param {Object} aMap - per-character list of match positions
 *        (assumes each list ends with an Infinity sentinel, as the comments below imply - TODO confirm against the code building it)
 * @param {number=} prefix - number of leading characters counted as already matched (defaults to 0)
 * @returns {number} lcs length, `prefix` included
 */
FuzzySearch.llcs_large = function (a, b, aMap, prefix) {

    //var aMap = FuzzySearch.posVector(a);

    //Position of next interest point. Interest point are either
    // - Increase in previous line
    // - Match on this line
    var block_start, match_pos;

    // We encode increase sequence as [start_pos, end_pos+1]
    // So end-start = length

    // To avoid dealing with to many edge case we place
    // a special token at start & end of list
    var last_line, line_index, last_end, block_end;
    if (prefix === undefined) prefix = 0;

    // The prefix is represented as one initial block [0, prefix); the (Infinity, Infinity) block is the closing sentinel.
    if (prefix)
        last_line = [new Block(0, prefix), new Block(Infinity, Infinity)];
    else
        last_line = [new Block(Infinity, Infinity)];

    var lcs_len = prefix;

    var match_list, match_index;
    var block, block_index, block_size;

    //First line
    var nb_blocks = last_line.length;

    var n = b.length, j;
    for (j = prefix; j < n; j++) {

        //Each line we process a single character of b
        var c = b[j];
        if (!(c in aMap)) continue;
        match_list = aMap[c];

        //New line
        // the number of if block can only increase up to llcs+1+sentinel
        // alternatively each block having >1 item can split. (+1 at end accounted by splitting sentinel)
        /** @type {Array} entries are Block instances */
        var current_line = new Array(Math.min(2 * nb_blocks, lcs_len + 2));
        line_index = -1;

        //First match
        match_index = 0;
        match_pos = match_list[0];

        //Place end of first block before the string
        block_end = -1;
        block_index = -1;


        while (++block_index < nb_blocks) {

            //Place cursor just after last block
            last_end = block_end;

            //Read end block
            block = last_line[block_index];
            block_start = block.start; //Encode block as [s,e[
            block_end = block.end; //End is position of char that follow last.
            block_size = block_end - block_start; //Size of block,  for sentinel (Inf-Inf=NaN)

            //get next match from list of matches
            while (match_pos < last_end) {
                match_pos = match_list[++match_index];
            }

            // This cover two case
            // a) no match between two block
            // b) block happens after last match (so match_pos=Infinity).
            //    At the last block, this will append closing "sentinel" to line
            if (block_start <= match_pos) {
                current_line[++line_index] = block;
                continue;
            }

            //
            // If we have reached here, we have a dominant match !
            // Decide where to register the match ...
            //

            if (match_pos === last_end) {
                //End of last block ? (step a.ii)
                current_line[line_index].end++;
            }
            else {

                //Increase need it's own block ( step a.i)
                //try to reuse block that will get deleted.
                if (block_size === 1) {
                    //Can we reuse next block ?
                    block.start = match_pos;
                    block.end = match_pos + 1;
                    current_line[++line_index] = block;
                } else {
                    //start a new block
                    current_line[++line_index] = new Block(match_pos, match_pos + 1);
                }

            }

            // if not empty, append next block to current line (step a.iii)
            // (this condition reject "sentinel", it'll get added just after the for loop)
            if (block_size > 1) {
                block.start++; // Move start by one
                current_line[++line_index] = block;
            }

        }

        // If the line finish with a match:
        //  a) llcs at end of this line is one greater than last line, increase score
        //  b) we still need to append sentinel
        if (block_start > match_pos) {
            current_line[++line_index] = block;
            lcs_len++
        }


        //Current become last
        last_line = current_line;

        //Count actual number of block because we allocate a bit more.
        nb_blocks = ++line_index;


    }

    return lcs_len;

};

/**
 * A block with start and end position
 * Used to record consecutive increase position in llcs_large
 * (llcs_large treats the range as half-open: `end` is the position following the last one).
 * @param start
 * @param end
 * @constructor
 */
function Block(start, end) {
    this.start = start;
    this.end = end;
}

//
// Reference implementation to debug
// Might need to swap input to match internal of a given algorithm
//

/*
 function lcs(a, b) {

 var m = a.length;
 var n = b.length;
 var i, j;

 //init m by n array  with 0
 var C = [], row = [], lcs = [];
 for (j = 0; j < n; j++) row[j] = 0;
 for (i = 0; i < m; i++) C[i] = row.slice();

 //fill first row and col
 C[0][0] = (a[0] === b[0]) ? 1 : 0;
 for (i = 1; i < m; i++) C[i][0] = (a[i] === b[0] || C[i - 1][0]) ? 1 : 0
 for (j = 1; j < n; j++) C[0][j] = (a[0] === b[j] || C[0][j - 1]) ? 1 : 0
 console.log(JSON.stringify(C[0]));

 //bulk
 for (i = 1; i < m; i++) {
 for (j = 1; j < n; j++) {
 C[i][j] = (a[i] === b[j]) ? C[i - 1][j - 1] + 1 : Math.max(C[i][j - 1], C[i - 1][j]);
 }
 console.log(JSON.stringify(C[i]));
 }

 //backtrack
 i--;
 j--;
 while (i > -1 && j > -1) {
 if (i && C[i][j] == C[i - 1][j]) i--;
 else if (j && C[i][j] == C[i][j - 1]) j--;
 else {
 lcs.push(a[i]);
 j--;
 i--;
 }
 }

 return lcs.reverse().join('');
 }*/
// main entry of the algorithm (once settings are set)
// loop over everything and merge best scores
// NOTE(review): a 'use strict' directive only takes effect as the first statement of a script or
// function; placed after other code, as here, it is a plain expression statement.
'use strict';

extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ {

    /**
     * Perform a search on the already indexed source.
     *
     * Steps, in order: optional lazy rebuild of the index, query preparation (`_prepQuery`),
     * optional pre-filtering through the index store and `options.filter`, scoring (`_searchIndex`),
     * threshold filtering, optional sort with `options.sorter`, optional output mapping / limit.
     * Side effects: sets `this.start_time`, `this.query`, `this.search_time` and `this.results`.
     *
     * @param {string} query_string
     * @returns {Array}
     */
    search: function (query_string) {

        var time_start = Date.now();
        this.start_time = time_start;
        var options = this.options;

        // As long as lazy is set to false, we guarantee that making a search is read only.
        // (read only with respect to the index: the timing/query/results fields above are still written)
        if (this.dirty && options.lazy) {
            this._buildIndexFromSource();
            this.dirty = false;
        }

        var query = this.query = this._prepQuery(query_string);
        var source = this.index;
        var results = [];

        // Narrow the candidate list before scoring.
        if (options.use_index_store) {
            source = this._storeSearch(query, source);
        }

        if (options.filter) {
            source = options.filter.call(this, source);
        }

        // ---- MAIN SEARCH LOOP ---- //
        var thresh_include = this._searchIndex(query, source, results);

        //keep only results that are good enough compared to best
        results = FuzzySearch.filterGTE(results, "score", thresh_include);

        // sort by decreasing order of score
        // equal rounded score: alphabetical order
        if (typeof options.sorter === "function")
            results = results.sort(options.sorter);

        if (options.output_map || options.output_limit > 0) {
            if (typeof options.output_map === "function")
                results = FuzzySearch.map(results, options.output_map, this, options.output_limit);
            else
                results = FuzzySearch.mapField(results, options.output_map, options.output_limit);
        }

        var time_end = Date.now();
        this.search_time = time_end - time_start;
        this.results = results;

        return results

    },


    /**
     * Main search loop for a specified source
     * This separation allow to search a different source, or a subset of source
     *
     * @param {Query} query
     * @param {Array.} source
     * @param {Array.} results
     * @returns {number} - thresh_include after this run.
+ * + * @private + */ + + _searchIndex: function (query, source, results) { + + var options = this.options; + var opt_bpd = options.bonus_position_decay; + var opt_fge = options.field_good_enough; + var opt_trb = options.thresh_relative_to_best; + var opt_score_tok = options.score_per_token; + var opt_round = options.score_round; + var thresh_include = options.thresh_include; + + var best_item_score = 0; + + var sub_query = query.children; + + for (var item_index = -1, nb_items = source.length; ++item_index < nb_items;) { + + //get indexed fields + var item = source[item_index]; + var item_fields = item.fields; + + //reset score + query.resetItem(); + + var item_score = 0; + var matched_field_index = -1; + var matched_node_index = -1; + var position_bonus = 1.0; + + // + //Foreach field + // + + for (var field_index = -1, nb_fields = item_fields.length; ++field_index < nb_fields;) { + + var field_score = 0; + var field_node = -1; + var field = item_fields[field_index]; + + var child_query = sub_query[field_index]; //tag search + var tagged = !!child_query; + + for (var node_index = -1, nb_nodes = field.length; ++node_index < nb_nodes;) { + var node_score, node = field[node_index]; + + if (opt_score_tok) { + node_score = this._scoreField(node, query); + if (tagged) node_score += this._scoreField(node, child_query);//tag search + } + else + node_score = FuzzySearch.score_map(query.fused_str, node.join(" "), query.fused_map, options); + + if (node_score > field_score) { + field_score = node_score; + field_node = node_index; + } + } + + field_score *= (1.0 + position_bonus); + position_bonus *= opt_bpd; + + if (field_score > item_score) { + item_score = field_score; + matched_field_index = field_index; + matched_node_index = field_node; + + if (field_score > opt_fge) break; + } + + } + + // + // Different query token match different fields ? 
+ // + + if (opt_score_tok) { + + var query_score = query.scoreItem(); + item_score = 0.5 * item_score + 0.5 * query_score; + + } + + // + // Keep track of best result, this control inclusion in the list + // + + if (item_score > best_item_score) { + best_item_score = item_score; + var tmp = item_score * opt_trb; + if (tmp > thresh_include) thresh_include = tmp; + } + + // + //candidate for best result ? push to list + // + + if (item_score > thresh_include) { + + item_score = Math.round(item_score / opt_round) * opt_round; + + results.push(new SearchResult( + item.item, + item_fields, + item_score, + matched_field_index, + matched_node_index, + item_fields[0][0].join(" ") + )); + + } + + } + + return thresh_include + }, + + /** + * Internal loop that is run for each field in an item + * + * @param {Array} field_tokens + * @param {Query} query + * @returns {number} + * @private + */ + + _scoreField: function (field_tokens, query) { + + var groups = query.tokens_groups; + var nb_groups = groups.length; + var nb_tokens = field_tokens.length; + if (!nb_groups || !nb_tokens) return 0; + + var field_score = 0, sc, bf; + var last_index = -1; + var options = this.options; + + var bonus_order = options.bonus_token_order; + var minimum_match = options.minimum_match; + + var token, scores, i; + for (var group_index = -1; ++group_index < nb_groups;) { + + var group_info = groups[group_index]; + var nb_scores = group_info.tokens.length; + + // Each packinfo have their own reusable scratch pad + // to store best score information, reset them to 0 + + var best_of_field = group_info.score_field; + for (i = -1; ++i < nb_scores;) best_of_field[i] = 0 + + var best_index = group_info.field_pos; + for (i = -1; ++i < nb_scores;) best_index[i] = 0 + + for (var field_tk_index = -1; ++field_tk_index < nb_tokens;) { + + token = field_tokens[field_tk_index]; + scores = FuzzySearch.score_pack(group_info, token, options); + for (i = -1; ++i < nb_scores;) { + sc = scores[i]; + bf = 
best_of_field[i]; + //Score is an improvement OR + //Score is within a token order bonus from being better, but word are swapped + + if (sc > bf || ( bf - sc < bonus_order && i > 0 && best_index[i] <= best_index[i - 1] )) { + best_of_field[i] = sc; + best_index[i] = field_tk_index; + } + + } + + } + + var best_match_this_item = group_info.score_item; + for (i = -1; ++i < nb_scores;) { + + sc = best_of_field[i]; + field_score += sc; + + + // Give bonus for pair in consecutive order + // Only consider positive match for bonus + if (sc > minimum_match) { + var this_index = best_index[i]; + + //Bonus is diluted by the distance between words. + //Positive match, but out of order get half the bonus. + var d = this_index - last_index; + var bo = bonus_order * ( 1.0 / (1.0 + Math.abs(d))); + if (d > 0) bo *= 2; + field_score += bo; + sc += bo; + last_index = this_index; + } + + if (sc > best_match_this_item[i]) + best_match_this_item[i] = sc; + + } + + + } + + if (options.score_test_fused) { + + // field_tokens.join(" "), remove last one if acronym + // performance of array.join(" ") and str concat look similar on modern browser. + + var n = (options.score_acronym) ? nb_tokens - 1 : nb_tokens; + var fused_field = field_tokens[0], fi = 0; + while (++fi < n) fused_field += " " + field_tokens[fi]; + + // test "space bar is broken" no token match + var fused_score = FuzzySearch.score_map(query.fused_str, fused_field, query.fused_map, options); + fused_score += bonus_order; //fused cannot be out of order + field_score = fused_score > field_score ? fused_score : field_score; + + if (fused_score > query.fused_score) { + query.fused_score = fused_score; + } + } + + + return field_score; + + } +}); + + + +extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ { + + /** + * Take a `source_item` (unprocessed item from source) and keys and produce + * an `item` that's ready to be added to `this.index`. 
+ * + * Preparation steps: + * - Apply lowercase, accent removal + * - Split field into token + * - Remove small token eg "a" "of" and prefix large token + */ + _prepItem: function (source_item, keys) { + + var item_fields = FuzzySearch.generateFields(source_item, keys); + + var nb_fields = item_fields.length; + + for (var field_index = -1; ++field_index < nb_fields;) { + + var field = item_fields[field_index]; + for (var node_index = -1, nb_nodes = field.length; ++node_index < nb_nodes;) { + + var norm = this.options.normalize(field[node_index]); + var nodes = norm.split(this.token_re); + //Filter size. (If total field length is very small, make an exception. + // Eg some movie/Book have a single letter title, filter risk of removing everything ) + if (norm.length > 2 * this.options.token_field_min_length) nodes = FuzzySearch.filterSize(nodes, this.options.token_field_min_length, this.options.token_field_max_length); + if (this.options.score_acronym) nodes.push(norm.replace(this.acro_re, "$1")); + field[node_index] = nodes; + + } + + } + + return new Indexed(source_item, item_fields); + }, + + /** + * Add an item to search index AND source collection. + * It'll use identify_item to find if the item already exist. + * If identify_item is null (default), calling this method is append-only with no duplicate detection + * + * To update the source, it use the assumption that this.source and this.index can be synced + * by array index. That assumption will be true if source is a plain array, and always updated by this library. + * Feel free to set `should_update_source` to false to manually manage source collection. + * + * Keeping source in sync is important to allow to recompute index from source. + * This will happens with certain setting changes. + * + * @param {*} source_item - item to add to search index + * @param {boolean=} should_update_source - set to false to skip updating the source. 
+ */ + + add: function(source_item, should_update_source){ + + // Default to keeping source in sync. + if(should_update_source === undefined) + should_update_source = true; + + var item_id = typeof this.options.identify_item === "function" + ? this.options.identify_item(source_item) + : null; + + // Find where to insert new item + + var idx; + if (item_id === null) { + // No identifier, append to end + idx = this.nb_indexed; + this.nb_indexed++; + } + else if (item_id in this.index_map) { + // Item exist, update + idx = this.index_map[item_id]; + } + else { + // New identifier, append to end & record new + this.index_map[item_id] = this.nb_indexed; + idx = this.nb_indexed; + this.nb_indexed++; + } + + // Compute indexed item and update index + var prepared = this._prepItem(source_item, this.keys); + this.index[idx] = prepared; + + // Insert in source; + if(should_update_source) + this.source[idx] = source_item; + + if (this.options.use_index_store) { + this._storeAdd(prepared, idx); + } + + }, + + /** + * Build (or rebuild) `this.index` from `this.source` + * Flatten object into array using specified keys + * + * @private + */ + + _buildIndexFromSource: function () { + var nb_items = this.source.length; + + this.index = new Array(nb_items); + this.index_map = {}; + this.nb_indexed = 0; + + for (var item_index = -1; ++item_index < nb_items;) { + var source_item = this.source[item_index]; + + // Add item to index. + // Because we are iterating over source, do not attempt to modify it. 
+ this.add(source_item, false); + } + } +}); + +/** + * Original item with cached normalized field + * + * @param {*} source_item + * @param {Array.} fields + * @constructor + */ + +function Indexed(source_item, fields) { + this.item = source_item; + this.fields = fields; +} + +// - - - - - - - - - - - - - - - - - - - - - - +// Input stage: prepare field for search +//- - - - - - - - - - - - - - - - - - - - - - + + +/** + * Given an object to index and a list of field to index + * Return a flat list of the values. + * + * @param {Object} obj + * @param {Array.} fieldlist + * @returns {Array} + */ + +FuzzySearch.generateFields = function (obj, fieldlist) { + + if (!fieldlist || !fieldlist.length) return [[obj.toString()]]; + + var n = fieldlist.length; + var indexed_fields = new Array(n); + + for (var i = -1; ++i < n;) + indexed_fields[i] = _collectValues(obj, fieldlist[i].split("."), [], 0); + + return indexed_fields; + +}; + + +/** + * Traverse an object structure to collect item specified by parts. + * If leaf node is an array or dictionary collect every children. + * If key is wildcard '*' branch out the search process on each children. 
/**
 * Traverse an object structure to collect the values designated by `parts`.
 * If the leaf is an array or a dictionary, every child is collected.
 * If a key is the wildcard '*', the traversal branches out on each child.
 *
 * Values are stored through `toString()`. Missing data never throws: a path that runs into a
 * missing key, a primitive, null or undefined collects nothing, and null/undefined leaves are skipped.
 *
 * @param {*} obj - root to process
 * @param {Array} parts - array of subkey to direct object traversal "those.that.this"->["those","that","this"]
 * @param {Array} list - where to put collected items
 * @param {number} level - index of current position on parts list
 * @returns {Array} - return list
 * @private
 */
function _collectValues(obj, parts, list, level) {

    var key, i, olen, value;
    var nb_level = parts.length;
    var hasOwn = Object.prototype.hasOwnProperty;

    while (level < nb_level) {

        key = parts[level++];
        if (key === "*" || key === "") break;

        // The `in` operator throws on primitives and on null/undefined:
        // such a value cannot hold the requested key, so nothing is collected.
        if (obj === null || (typeof obj !== "object" && typeof obj !== "function") || !(key in obj)) return list;
        obj = obj[key];

    }

    var type = Object.prototype.toString.call(obj);
    var isArray = ( type === '[object Array]' );
    var isObject = ( type === '[object Object]' );

    if (level === nb_level) {

        // Leaf reached: collect it (or its children), skipping null / undefined values.
        if (isArray) {
            for (i = -1, olen = obj.length; ++i < olen;) {
                value = obj[i];
                if (value != null) list.push(value.toString());
            }
        }

        else if (isObject) {
            for (key in obj) {
                // borrowed hasOwnProperty: also works on objects created without a prototype
                if (hasOwn.call(obj, key)) {
                    value = obj[key];
                    if (value != null) list.push(value.toString());
                }
            }
        }

        else if (obj != null) list.push(obj.toString());


    }

    else if (key === "*") {

        // Wildcard: continue the traversal from every child.
        if (isArray)
            for (i = -1, olen = obj.length; ++i < olen;) {
                _collectValues(obj[i], parts, list, level);
            }

        else if (isObject)
            for (key in obj) {
                if (hasOwn.call(obj, key))
                    _collectValues(obj[key], parts, list, level);
            }
    }

    return list;
}

'use strict';

extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ {

    /**
     * Register item index `idx` under every store key derived from the item's tokens.
     * `this.store` maps key => array of item indexes.
     *
     * NOTE(review): `idx` is appended unconditionally, so adding the same index twice
     * leaves repeated entries, which retrieveCount then counts twice - confirm.
     *
     * @param {Indexed} preparedItem
     * @param {int} idx
     */
    _storeAdd: function (preparedItem, idx) {

        var keyList = keysFromIndexedItem(preparedItem);
        if (keyList.length == 0) return;

        // register idx on all appropriate key
        for (var i = 0; i < keyList.length; i++) {
            var key = keyList[i];

            if (key in this.store) {
                // append to existing array of index
                this.store[key].push(idx);
            }
            else {
                // Format is dict key => array of item index
                this.store[key] = [idx];
            }
        }


    },


    /**
     * Use the key store to pre-select the items of `source` worth scoring.
     * Candidates are ranked by number of store keys shared with the query; those reaching
     * `store_thresh` times the best count are kept, up to `store_max_results`.
     *
     * @param {Query} preparedQuery
     * @param {Array} source
     * @returns {Array} subset of `source` (empty when the query yields no key or nothing matches)
     */
    _storeSearch: function (preparedQuery, source) {

        // Scan query for index keys.
        var keyList = keysFromQuery(preparedQuery);
        if (keyList.length == 0) return [];

        // return filtered source
        var idAndCount = retrieveCount(keyList, this.store);
        if (idAndCount.length == 0) return [];

        // Get minimum quality and remap to original items.
        var tresh = idAndCount[0].count * this.options.store_thresh;
        idAndCount = FuzzySearch.filterGTE(idAndCount, "count", tresh);
        return FuzzySearch.map(idAndCount,
            function (x) { return source[x.id] },
            this, this.options.store_max_results);

    }

});

/**
 * Collect the store keys of every word token of a prepared item.
 *
 * @param {Indexed} preparedItem
 * @returns {Array} distinct keys, in first-seen order
 */

function keysFromIndexedItem(preparedItem) {

    // Process the nested structure of a prepared item in order to extract index keys.
    var keyList = [];
    var keyDict = {};

    // item -> fields -> nodes -> word_tokens
    var fields = preparedItem.fields;
    for (var i = 0; i < fields.length; i++) {
        var nodes = fields[i];
        for (var j = 0; j < nodes.length; j++) {
            var words = nodes[j];
            for (var k = 0; k < words.length; k++) {
                keysFromWord(words[k], keyList, keyDict)
            }
        }
    }

    return keyList;
}

/**
 * Collect the store keys of a query: its own words plus the words of each sub-query.
 *
 * @param {Query} query - reads `words` and `children` (each child exposing `words`)
 * @returns {Array} distinct keys, in first-seen order
 */

function keysFromQuery(query) {

    var keyList = [];
    var keyDict = {};
    var i, j;

    var words = query.words;
    for (i = 0; i < words.length; i++) {
        keysFromWord(words[i], keyList, keyDict)
    }

    var children = query.children;
    for (i = 0; i < children.length; i++) {
        // a slot without sub-query contributes nothing
        if (!children[i]) continue;
        words = children[i].words;
        // compare against the length: `j < words` (an array) is never true,
        // which silently dropped every sub-query word
        for (j = 0; j < words.length; j++) {
            keysFromWord(words[j], keyList, keyDict)
        }
    }

    return keyList;

}


/**
 * Add the store keys of a single word to `keysList`:
 * 3-letter combinations from the first 6 letters, 2-letter combinations from the first 4,
 * and the first letter alone. The empty word yields no key.
 *
 * @param {string} word
 * @param {Array} keysList - output list of distinct keys
 * @param {Object} existingDict - set of keys already present in `keysList`
 */
function keysFromWord(word, keysList, existingDict) {

    var len = word.length;
    if (len == 0) return;

    if (len >= 3) {
        // 3o6, 3o5, 3o4, 3o3
        select3(word, 6, keysList, existingDict)
    }

    if (len >= 2) {
        // 2o4, 2o3,2o2
        select2(word, 4, keysList, existingDict)
    }

    // 1o1 strategy: This index by first letter
    union(word[0], keysList, existingDict);

}

/**
 * Add every ordered pair of letters taken from the first `maxlen` letters of `str`.
 */
function select2(str, maxlen, existingList, existingDict) {
    var len = Math.min(str.length, maxlen);
    for (var i = 0; i < len - 1; i++) {
        for (var j = i + 1; j < len; j++) {
            union(str[i] + str[j], existingList, existingDict)
        }
    }
    return existingList;
}

/**
 * Add every ordered triple of letters taken from the first `maxlen` letters of `str`.
 */
function select3(str, maxlen, existingList, existingDict) {
    var len = Math.min(str.length, maxlen);
    for (var i = 0; i < len - 2; i++) {
        for (var j = i + 1; j < len - 1; j++) {
            for (var k = j + 1; k < len; k++) {
                union(str[i] + str[j] + str[k], existingList, existingDict)
            }
        }
    }
    return existingList;
}


/**
 * Append `word` to `existingList` unless `existingDict` already knows it.
 */
function union(word, existingList, existingDict) {
    if (!(word in existingDict)) {
        existingDict[word] = true;
        existingList.push(word);
    }
}

/**
 * Count, for every item index found in `store` under one of `keys`, how many entries reference it.
 *
 * @param {Array} keys
 * @param {Object} store - dict key => array of item index
 * @returns {Array} IdAndCount entries sorted by decreasing count (`id` is the index as an object key, i.e. a string)
 */
function retrieveCount(keys, store) {

    // Dictionary idx => count
    var countPerIndex = {};

    if (keys.length == 0)
        return [];

    for (var i = 0; i < keys.length; i++) {

        var key = keys[i];

        // Does the key exist in the index ?
        if (key in store) {

            // If so add every entry of that key into countPerIndex
            // Also for each entry, maintain a count of matched keys.

            var idxList = store[key];
            for (var j = 0; j < idxList.length; j++) {

                var idx = idxList[j];

                if (idx in countPerIndex) {
                    countPerIndex[idx]++;
                } else {
                    countPerIndex[idx] = 1;
                }
            }

        }
    }

    // Transform countPerIndex into a sorted list of IdAndCount

    var outList = [];

    for (var id in countPerIndex) {
        if (countPerIndex.hasOwnProperty(id)) {
            outList.push(new IdAndCount(id, countPerIndex[id]));
        }
    }

    // We can probably filterGte here.

    // Custom sort decreasing order
    outList = outList.sort(function (a, b) {
        return b.count - a.count
    });

    return outList;

}

/**
 * Item index together with its number of matched store entries.
 * @constructor
 */
function IdAndCount(id, count) {
    this.id = id;
    this.count = count;
}
//
// Shared string and array of string functions
//
'use strict';


/**
 * Take a string into a normal form.
 */
/**
 * Take a string into a normal form, so comparisons become case insensitive
 * and accented letters match their base form ("é" vs "e").
 *
 * The string is lower-cased, then every character outside U+0000-U+007E that has an entry in
 * `diacriticsMap` is replaced by that entry; other characters are left untouched.
 * Token separators are not modified here.
 *
 * @param {string} str - falsy input yields ""; other non-string input is converted with String()
 * @returns {string} - normalized str
 */

function normalize(str) {
    if (!str) return "";
    // String(): a truthy non-string (e.g. a number) must not throw on toLowerCase
    return String(str).toLowerCase().replace(/[^\u0000-\u007E]/g, function (a) {
        return diacriticsMap[a] || a;
    });
}

/**
 * Build the accented letter => base letter dictionary used by normalize.
 * @returns {Object}
 */
function getDiacriticsMap() {
    // replace most common accents in french-spanish by their base letter
    //"ãàáäâæẽèéëêìíïîõòóöôœùúüûñç"
    var from = "\xE3\xE0\xE1\xE4\xE2\xE6\u1EBD\xE8\xE9\xEB\xEA\xEC\xED\xEF\xEE\xF5\xF2\xF3\xF6\xF4\u0153\xF9\xFA\xFC\xFB\xF1\xE7";
    var to = "aaaaaaeeeeeiiiioooooouuuunc";
    var diacriticsMap = {};
    // `from` and `to` are aligned position by position
    for (var i = 0; i < from.length; i++) {
        diacriticsMap[from[i]] = to[i]
    }
    return diacriticsMap;
}

var diacriticsMap = getDiacriticsMap();

/**
 * Process an array of string, filter out item smaller than min, trim item larger than max.
 *
 * @param {Array} array - array of string
 * @param minSize - filter out item smaller than this
 * @param maxSize - item at least this long are cut down to their first `maxSize` characters
 * @returns {Array}
 */

FuzzySearch.filterSize = function (array, minSize, maxSize) {
    var i = -1, j = -1;
    var n = array.length;
    var out = [];
    var str, slen;

    while (++i < n) {
        str = array[i];
        slen = str.length;
        // substring(0, maxSize) replaces the deprecated substr(0, maxSize); same result
        if (slen >= minSize) out[++j] = (slen < maxSize) ? str : str.substring(0, maxSize)
    }
    return out;
};


//
// Extend base option to support highlight
//
'use strict';

extend(FuzzySearch.defaultOptions, /** @lends {FuzzySearchOptions.prototype} */{

    highlight_prefix: false,         // true: force prefix as part of highlight, (false: minimum gap, slower)
    highlight_bridge_gap: 2,         // display small gap as substitution, set to size of gap, 0 to disable
    // NOTE(review): with both markers empty, highlighting wraps matches in nothing and the output
    // equals the input; presumably opening/closing tags are expected here - confirm.
    highlight_before: '',  //tag to put before/after the highlight
    highlight_after: ''

});


/**
 * Highlight a string using the query stored in a FuzzySearch object (`this.query`).
 *
 * When `field` names one of `this.tags` and the query has a sub-query at that position,
 * the sub-query's normalized text is appended (space separated) to the main one.
 *
 * @param {string} str - text to highlight
 * @param {string=} field
 * @returns {string} result of FuzzySearch.highlight with `this.options`
 */
FuzzySearch.prototype.highlight = function (str, field) {
    var i, subq;
    var qnorm = this.query.normalized;
    if (field && field.length && (i = this.tags.indexOf(field)) > -1 && (subq = this.query.children[i])) qnorm += (qnorm.length ? " " : "") + subq.normalized;
    return FuzzySearch.highlight(qnorm, str, this.options)
};

/**
 * Highlight string b, from searching a in it.
 */
/**
 * Highlight string b, from searching a in it.
 *
 * Both strings are normalized and tokenized with `options.token_re`. Depending on the scores,
 * either each token of `b` is aligned with its matched token of `a`, or the two whole strings
 * are aligned as a single token ("fused"). Aligned ranges of the original spelling of `b` are
 * wrapped in `highlight_before` / `highlight_after`; separators are preserved.
 *
 * NOTE(review): the text of `b` is concatenated with the markers as is (no escaping) - callers
 * inserting the result into HTML have to deal with untrusted content themselves.
 * NOTE(review): FuzzySearch.align is called without `options` here, so it falls back to
 * FuzzySearch.defaultOptions even when custom options are passed to this function - confirm intended.
 *
 * @param {string} a - string to search
 * @param {string} b - string to highlight
 * @param {FuzzySearchOptions=} options - defaults to FuzzySearch.defaultOptions
 * @returns {string} highlighted `b` ("" when `b` is falsy, `b` itself when nothing matches)
 */
FuzzySearch.highlight = function (a, b, options) {

    if (options === undefined) options = FuzzySearch.defaultOptions;
    if (!b) return "";

    var open_string = options.highlight_before;
    var close_string = options.highlight_after;
    var opt_score_tok = options.score_per_token;
    var opt_fuse = options.score_test_fused;
    var opt_acro = options.score_acronym;
    var token_re = options.token_re;

    var aa = options.normalize(a);
    var bb = options.normalize(b);

    //Normalized needle
    var a_tokens = aa.split(token_re);

    //Normalized haystack
    var b_tokens = bb.split(token_re);

    //Original spelling haystack
    var disp_tokens = [], disp_sep = [];
    splitKeepSep(b, token_re, disp_tokens, disp_sep);


    var strArr = [];
    var match_list = [];
    var fused_score = 0, match_score = 0;

    if (opt_score_tok) {
        match_score = FuzzySearch.matchTokens(b_tokens, a_tokens, match_list, options, false);
    }

    //Test "space bar is broken" no token match
    if (opt_fuse || !opt_score_tok || opt_acro) fused_score = FuzzySearch.score_map(aa, bb, FuzzySearch.alphabet(aa), options) + options.bonus_token_order;

    if (match_score === 0 && fused_score === 0) return b; //shortcut no match


    if (!opt_score_tok || fused_score > match_score) {
        a_tokens = [aa]; //everything in a single token
        b_tokens = [bb];
        disp_tokens = [b];
        // the single token is the whole string: no separator follows it
        // (keeping the old list appended the first separator of b after the result)
        disp_sep = [""];
        match_list = [0];
    }

    var nbtok = disp_tokens.length, j = -1;
    while (++j < nbtok) {

        var i = match_list[j];

        // unmatched token: copy it through
        if (i === -1) {
            strArr.push(disp_tokens[j] + disp_sep[j]);
            continue;
        }

        var ta = a_tokens[i];
        var tb = b_tokens[j];
        var td = disp_tokens[j];
        var curr = 0;

        var start_positions = [];
        var end_positions = [];
        FuzzySearch.align(ta, tb, start_positions, end_positions);
        var len = start_positions.length;

        // Positions come from the normalized token and are applied to the original spelling.
        var k = -1;
        while (++k < len) {

            var s = start_positions[k];
            var e = end_positions[k];
            if (s > curr) strArr.push(td.substring(curr, s));
            strArr.push(open_string + td.substring(s, e) + close_string);
            curr = e;

        }

        strArr.push(td.substring(curr) + disp_sep[j]);

    }

    return strArr.join('');

};


/**
 * Split `str` on `pattern`, keeping the separators.
 * Tokens are appended to `tokens` and, at the same positions, the separator following each
 * token to `seps` (the last token gets ""). A string without separator yields itself.
 *
 * The scan always starts at the beginning of `str`, works with non-global patterns, and skips
 * zero-length matches instead of looping on them.
 *
 * @param {string} str
 * @param {RegExp} pattern
 * @param {Array} tokens - output
 * @param {Array} seps - output
 */
function splitKeepSep(str, pattern, tokens, seps) {

    var tok_index = tokens.length;

    // exec() only walks through the string with a global pattern
    var re = pattern.global ? pattern : new RegExp(pattern.source, pattern.flags + "g");
    // a shared pattern may carry a lastIndex left by an earlier, interrupted scan
    re.lastIndex = 0;

    var start = 0, match, sep;
    while ((match = re.exec(str)) !== null) {

        sep = match[0];
        if (sep.length === 0) {
            // empty match: exec() would return it again forever, step over it
            re.lastIndex++;
            continue;
        }

        tokens[tok_index] = str.substring(start, match.index);
        seps[tok_index] = sep;
        start = match.index + sep.length;
        tok_index++;
    }

    tokens[tok_index] = str.substring(start);
    seps[tok_index] = "";


}


//
// Smith-Waterman-Gotoh local Alignment
//
// Smith&Waterman worked the idea of local alignment
// While Gotoh 82  worked on affine gap penalty.
//
// This is the basic algorithm with some optimisation to use less space.
// JAligner has been used as a reference implementation to debug.
// Some of their implementation detail to save memory has been reused here.
//
// See pseudo-code on
// http://jaligner.sourceforge.net/api/jaligner/SmithWatermanGotoh.html
//
//

/**
 * Smith-Waterman-Gotoh local Alignment
 * Build sequences of matches; matched ranges are appended to seq_start / seq_end
 * Return match score
 *
 * The ranges are positions in `b`, `seq_end` being exclusive; both arrays are filled backward
 * and reversed before returning (any entry already present in them is reversed as well).
 *
 * NOTE: when `a === b` the single range pushed is [0, m) with m = min(a.length + 1, token_query_max_length),
 * i.e. its end can exceed the string length by one.
 *
 * @param {string} a -  string to search
 * @param {string} b - string to be searched
 * @param {number[]} seq_start - store for match start
 * @param {number[]} seq_end - store for match end
 * @param {FuzzySearchOptions=} options - defaults to FuzzySearch.defaultOptions; reads `score_acronym`,
 *        `token_sep`, `token_query_max_length`, `token_field_max_length`, `highlight_prefix`, `highlight_bridge_gap`
 * @returns {number} best alignment score plus the length of the skipped prefix
 */

FuzzySearch.align = function (a, b, seq_start, seq_end, options) {

    if (options === undefined) options = FuzzySearch.defaultOptions;

    var wm = 100; // score of making a match
    var wo = -10; // score to open a gap
    var we = -1;  // score to continue an open gap

    //Traceback directions constants
    var STOP = 0;
    var UP = 1;
    var LEFT = 2;
    var DIAGONAL = 3;

    var score_acronym = options.score_acronym;
    var sep_tokens = options.token_sep;

    // table dimensions: string length + 1, capped by the configured maximum token lengths
    var m = Math.min(a.length + 1, options.token_query_max_length);
    var n = Math.min(b.length + 1, options.token_field_max_length);

    // Comon prefix is part of lcs,
    // but not necessarily part of best alignment  (it can introduce an extra gap)
    // however prefix  make sens in an autocomplete scenario and speed things up
    //
    var i, j;
    var k = m < n ? m : n;
    var prefix_len = 0;

    if (a === b) {
        //speedup equality
        // m = 0 makes the table empty and skips the whole fill below
        prefix_len = m;
        m = 0;
    }
    else if (options.highlight_prefix) {
        for (i = 0; i < k && (a[i] === b[i]); i++) prefix_len++;

        // strip the common prefix; positions are shifted back by prefix_len when recorded
        if (prefix_len) {
            a = a.substring(prefix_len);
            b = b.substring(prefix_len);

            m -= prefix_len;
            n -= prefix_len;
        }
    }

    var vmax = 0, imax = 0, jmax = 0;
    var trace = new Array(m * n); // traceback directions, row-major: index = i * n + j
    var pos = n - 1;

    //m,n = length+1
    if (m > 1 && n > 1) {


        var vrow = new Array(n), vd, v, align;
        var gapArow = new Array(n), gapA, gapB = 0;

        // first row: nothing aligned yet
        for (j = 0; j < n; j++) {
            gapArow[j] = 0;
            vrow[j] = 0;
            trace[j] = STOP;
        }

        //DEBUG
        //var DEBUG_V = [];
        //var DEBUG_TR = [];

        for (i = 1; i < m; i++) {

            gapB = 0;
            vd = vrow[0];

            // first column of the row
            pos++;
            trace[pos] = STOP;

            //DEBUG
            //DEBUG_V[i] = [];
            //DEBUG_TR[i] = [];

            for (j = 1; j < n; j++) {

                //
                // Reference "pseudocode"
                // We try to fill that table, but using o(n) instead o(m*n) memory
                // If we need traceback we still need o(m*n) but we store a single table instead of 3
                //
                // F[i][j] = f = Math.max(F[i - 1][j] + we, V[i - 1][j] + wo );
                // E[i][j] = e = Math.max(E[i][j - 1] + we, V[i][j - 1] + wo );
                // align = (a[i - 1] === b[j - 1]) ? V[i - 1][j - 1] + wm : -Infinity;
                // V[i][j] = v = Math.max(e, f, align, 0);
                //

                // Score the options
                gapA = gapArow[j] = Math.max(gapArow[j] + we, vrow[j] + wo); //f
                gapB = Math.max(gapB + we, vrow[j - 1] + wo); //e

                // In acronym mode a match gets an extra wm for each side where the character
                // starts the string or follows a separator character.
                if (score_acronym)
                    align = ( a[i - 1] !== b[j - 1] ) ? -Infinity : (
                        vd + wm +
                        ( ( i < 2 || sep_tokens.indexOf(a[i - 2]) > -1 ) ? wm : 0) +
                        ( ( j < 2 || sep_tokens.indexOf(b[j - 2]) > -1 ) ? wm : 0)
                    );
                else
                    align = ( a[i - 1] === b[j - 1] ) ? vd + wm : -Infinity;

                // keep the previous row's value at j: it is the diagonal for column j + 1
                vd = vrow[j];

                v = vrow[j] = Math.max(align, gapA, gapB, 0);

                //DEBUG
                //DEBUG_V[i][j] = v;

                // Determine the trace back direction
                pos++;  //pos = i * n + j;
                switch (v) {

                    // what triggered the best score ?
                    //In case of equality, taking gapB get us closer to the start of the string.
                    case gapB:
                        trace[pos] = LEFT;
                        break;

                    case align:
                        trace[pos] = DIAGONAL;

                        // the best alignment always ends on a match
                        if (v > vmax) {
                            vmax = v;
                            imax = i;
                            jmax = j;
                        }

                        break;


                    case gapA:
                        trace[pos] = UP;
                        break;

                    default:
                        trace[pos] = STOP;
                        break;

                }

                //DEBUG
                //DEBUG_TR[i][j] = trace[pos];

            }
        }


    }

    //DEBUG
    //console.table(DEBUG_V);
    //console.table(DEBUG_TR);


    // - - - - - - - - -
    //     TRACEBACK
    // - - - - - - - - -

    var bridge = options.highlight_bridge_gap;
    var last_match = 0;

    if (vmax > 0) {

        // backtrack to aligned sequence
        // record start and end of substrings
        // vmax happens at the end of last substring

        i = imax;
        j = jmax;
        pos = i * n + j;
        last_match = jmax;
        seq_end.push(jmax + prefix_len);


        var backtrack = true;
        while (backtrack) {

            switch (trace[pos]) {

                case UP:
                    i--;
                    pos -= n;
                    break;

                case LEFT:
                    j--;
                    pos--;
                    break;

                case DIAGONAL:

                    // if we have traversed a gap
                    // record start/end of sequence
                    // (unless we want to bridge the gap)

                    if (last_match - j > bridge) {
                        seq_start.push(last_match + prefix_len);
                        seq_end.push(j + prefix_len);
                    }

                    j--;
                    i--;
                    last_match = j;
                    pos -= n + 1;
                    break;

                case STOP:
                default :
                    backtrack = false;

            }

        }

        //first matched char
        seq_start.push(last_match + prefix_len);

    }


    if (prefix_len) {

        if (last_match > 0 && last_match <= bridge) {

            //bridge last match to prefix ?
            seq_start[seq_start.length - 1] = 0

        } else {

            //add prefix to matches
            seq_start.push(0);
            seq_end.push(prefix_len);

        }

    }

    //array were build backward, reverse to sort
    seq_start.reverse();
    seq_end.reverse();

    return vmax + prefix_len;


};


//
// Each query token is matched against a field token
// or against nothing (not in field)
//
// a: [paint] [my] [wall]
// b: [wall] [painting]
//
// match: [1, -1, 0]
//
// if a[i] match b[j]
//      then match[i] = j
//
// if a[i] match nothing
//      then match[i] = -1
//
// return match score
// take vector match by reference to output match detail
//
// Ideal case:
// each token of "a" is matched against it's highest score(a[i],b[j])
//
// But in case two token have the same best match
// We have to check for another pairing, giving highest score
// under constraint of 1:1 exclusive match
//
// To do that we check all possible pairing permutation,
// but we restrict ourselves to a set of plausible pairing.
//
// That is a token a will only consider pairing with a score at least
//     thresh_relative_to_best * [highest score]
//

/**
 * Match token of A again token of B, under constraint that tokens can be matched at most once.
 *
 * @param {Array.} a_tokens
 * @param {Array.} b_tokens
 * @param {Array.} match - array to store results
 * @param {FuzzySearchOptions=} options
 * @param {boolean=} flip - if true score A against B, but return index of B against A.
 * @returns {number} Score of the best match combination.
+ */
+FuzzySearch.matchTokens = function (a_tokens, b_tokens, match, options, flip) {
+
+    // Overview:
+    //  1. Score every (a token, b token) pair into the grid C and remember each row's best score.
+    //  2. Take a shortcut when zero or exactly one pair scores above minimum_match.
+    //  3. Otherwise hand the grid to _matchScoreGrid to pick the best 1:1 pairing.
+    // `match` is filled in place: match[i] is the index in b paired with a[i], or -1.
+    // With `flip`, `match` is remapped afterwards to be indexed by b instead (see _flipmatch).
+
+    if (options === undefined) options = FuzzySearch.defaultOptions;
+    if (flip === undefined) flip = false;
+
+    var minimum_match = options.minimum_match;
+    var best_thresh = options.thresh_relative_to_best;
+
+    var i, j, row;
+    var C = [];
+
+    var m = a_tokens.length;
+    var n = b_tokens.length;
+
+    var a_maps = FuzzySearch.mapAlphabet(a_tokens);
+    var a_tok, b_tok, a_mp;
+
+    var rowmax = minimum_match, imax = -1, jmax = -1, v;
+    var match_count = 0;
+    var thresholds = [];
+
+    for (i = 0; i < m; i++) {
+
+        row = [];
+        match[i] = -1;
+        // rowmax is per-row state: it restarts from minimum_match for every query token
+        rowmax = minimum_match;
+
+        a_tok = a_tokens[i];
+        if (!a_tok.length) {
+            //skip score loop but still fill array
+            for (j = 0; j < n; j++) row[j] = 0;
+            C[i] = row;
+            // Keep the threshold defined for empty tokens. Left undefined it becomes NaN
+            // in the scaling loop below and NaN comparisons never prune anything.
+            thresholds[i] = rowmax;
+            continue;
+        }
+
+        a_mp = a_maps[i];
+
+        for (j = 0; j < n; j++) {
+
+            b_tok = b_tokens[j];
+            if (!b_tok.length) {
+                row[j] = 0;
+                continue;
+            }
+
+            v = FuzzySearch.score_map(a_tok, b_tok, a_mp, options);
+            row[j] = v;
+
+            if (v > minimum_match) match_count++;
+
+            if (v > rowmax) {
+                rowmax = v;
+                imax = i;
+                jmax = j;
+            }
+
+        }
+
+        thresholds[i] = rowmax;
+
+        C[i] = row;
+    }
+
+    //Shortcut: no match
+    if (match_count === 0) return 0;
+
+    //Shortcut: single possible pairing
+    if (match_count === 1) {
+        match[imax] = jmax;
+        if (flip) _flipmatch(match, n);
+        // Read the score from the grid: rowmax only describes the LAST row processed,
+        // so it is wrong whenever the single match sits in an earlier row.
+        return C[imax][jmax];
+    }
+
+
+    //Only consider matching close enough to best match
+    for (i = 0; i < m; i++) {
+        thresholds[i] = Math.max(best_thresh * thresholds[i], minimum_match);
+    }
+
+
+    var score = _matchScoreGrid(C, match, thresholds, options.bonus_token_order);
+
+    //Flip back the problem if necessary
+    if (flip) _flipmatch(match, n);
+
+    return score;
+
+};
+
+/**
+ * Perform the match as FuzzySearch.matchTokens
+ * but token against token score is already computed as C
+ *
+ * This is mostly a preparation phase for _buildScoreTree as well
+ * as a post processing traversal to recover the match.
+ *
+ * @param {Array.<Array.<number>>} C - precomputed score grid (one row per query token)
+ * @param {Array.<number>} match - receives, per query token, the chosen column index (-1 when unmatched)
+ * @param {Array.<number>} thresholds - minimum score each token is willing to match
+ * @param {number} order_bonus
+ * @returns {number} - best score
+ * @private
+ */
+function _matchScoreGrid(C, match, thresholds, order_bonus) {
+
+    var depth_count = C.length;
+    var depth, picked, node;
+
+    // One cache object per row; _buildScoreTree fills them, keyed by the used-column bit mask
+    var cache = [];
+    for (depth = 0; depth < depth_count; depth++) cache.push({});
+
+    var best = _buildScoreTree(new TreeOptions(C, cache, thresholds, order_bonus), 0, 0);
+
+    // Replay the cached decisions from the first row down to recover the chosen pairing
+    var used_mask = 0;
+    depth = 0;
+    while (depth < depth_count) {
+
+        node = cache[depth][used_mask];
+        if (!node) break;
+
+        picked = node.index;
+        match[depth] = picked;
+        if (picked > -1) used_mask = used_mask | (1 << picked);
+
+        depth++;
+    }
+
+    return best.score;
+}
+
+//
+// Cache tree:
+//
+// Given 5 nodes: 1,2,3,4,5
+//
+// What is the best match ...
+// - knowing that we have passed through 1->2->3
+// - knowing that we have passed through 2->3->1
+// - knowing that we have passed through 3->1->2
+//
+// All those questions have the same answer
+// because they are equivalent to matching {4,5} against {4,5}
+// ( in an alternate pass we can match {1,3} against {4,5} for example )
+//
+// We store matches in j in a bit vector of size 32
+//
+// In addition to saving computation, the cache_tree data structure is used to
+// trace back the best permutation !
+//
+// In addition to quickly testing if an item is already used, used_mask serves
+// as a key in cache_tree (in addition to level). Ideal key would be a list of available trials
+// but used & available are complementary vectors (~not operation) so used is a perfectly valid key too...
+
+
+/**
+ * Branch out to try each permutation of items of A against item of B.
+ * - Only try branches not already used.
+ * - Prune branch below token threshold.
+ * - Build a tree to cache sub-problems for which we already have a solution
+ *
+ * Recursive: each call solves row `depth` of the score grid given the columns already
+ * taken (used_mask), stores its answer in cache_tree[depth][used_mask] and returns it.
+ *
+ * @param {TreeOptions} tree_opt
+ * @param {number} used_mask - bit j set means column j is already paired
+ * @param {number} depth - row of the score grid being decided
+ * @returns {MatchTrial} best_trial
+ * @private
+ */
+
+function _buildScoreTree(tree_opt, used_mask, depth) {
+
+    var C = tree_opt.score_grid;
+    var cache_tree = tree_opt.cache_tree;
+    var score_thresholds = tree_opt.score_thresholds;
+    var order_bonus = tree_opt.order_bonus;
+
+    var ilen = C.length;
+    var jlen = C[depth].length;
+    // columns beyond INT_SIZE are never tried, since each column needs its own bit in used_mask
+    if (jlen > INT_SIZE) jlen = INT_SIZE;
+
+    var j, score;
+    var include_thresh = score_thresholds[depth];
+    var best_score = 0, best_index = -1;
+    var has_child = (depth < ilen - 1);
+    var child_tree = cache_tree[depth + 1], child_key;
+
+    for (j = 0; j < jlen; j++) {
+
+        var bit = 1 << j;
+
+        //if token previously used, skip
+        if (used_mask & bit) continue;
+
+        //score for this match
+        score = C[depth][j];
+
+        //too small of a match, skip
+        if (score < include_thresh) continue;
+
+        //score for child match
+        //if we already have computed this sub-block get from cache
+        if (has_child) {
+            child_key = used_mask | bit;
+
+            /** @type MatchTrial */
+            var trial = (child_key in child_tree) ?
+                child_tree[child_key] :
+                _buildScoreTree(tree_opt, child_key, depth + 1);
+
+            score += trial.score;
+            // reward pairings whose column index increases from this row to the next
+            if (j < trial.index) {
+                score += order_bonus
+            }
+        }
+
+        //Because of DFS, first loop that finish is toward the end of the query.
+        //As a heuristic, it's good to match higher index toward the end. So we accept equality.
+        if (score >= best_score) {
+            best_score = score;
+            best_index = j;
+        }
+
+    }
+
+    //try the move of "do not match this token against anything"
+    if (has_child) {
+
+        child_key = used_mask;
+        if (child_key in child_tree) score = child_tree[child_key].score;
+        else score = _buildScoreTree(tree_opt, child_key, depth + 1).score;
+
+        // strict comparison: skipping this token only wins when it is strictly better
+        if (score > best_score) {
+            best_score = score;
+            best_index = -1;
+        }
+
+    }
+
+    var best_trial = new MatchTrial(best_score, best_index);
+    cache_tree[depth][used_mask] = best_trial;
+    return best_trial;
+
+}
+
+/**
+ * Result of one cached sub-problem: the best score and the column index chosen (-1 = none).
+ *
+ * @param score
+ * @param index
+ * @constructor
+ */
+function MatchTrial(score, index) {
+    this.score = score;
+    this.index = index;
+}
+
+/**
+ * Bundle of the values shared by every _buildScoreTree call.
+ *
+ * @param {Array<Array<number>>} score_grid
+ * @param {Array<Object<number,MatchTrial>>} cache_tree
+ * @param {Array<number>} score_thresholds
+ * @param {number} order_bonus
+ * @constructor
+ */
+function TreeOptions(score_grid, cache_tree, score_thresholds, order_bonus) {
+    this.score_grid = score_grid;
+    this.cache_tree = cache_tree;
+    this.score_thresholds = score_thresholds;
+    this.order_bonus = order_bonus
+}
+
+
+/**
+ * Let A,B be two arrays
+ * Input is an array that maps "index of A"->"index of B"
+ * Output is the reverse "index of B"->"index of A"
+ *
+ * Array is modified in place (resized to newlen; unmatched entries are -1)
+ *
+ * @param {Array.<number>} match - array to remap
+ * @param {number} newlen - length of B
+ * @private
+ */
+
+function _flipmatch(match, newlen) {
+
+    var i, j;
+    // work from a copy because match itself is overwritten below
+    var ref = match.slice();
+    match.length = newlen;
+
+    for (i = 0; i < newlen; i++) {
+        match[i] = -1;
+    }
+
+    for (i = 0; i < ref.length; i++) {
+        j = ref[i];
+        if (j > -1 && j < newlen) match[j] = i;
+    }
+
+}
+//
+// - - - - - - - - - - - -
+// UI INTEGRATION
+// - - - - - - - - - - - -
+//
+
+extend(FuzzySearch.prototype, /** @lends {FuzzySearch.prototype} */ {
+
+    /**
+     * Return a debounced version of FuzzySearch.search.
+     * The new function signature allows a specific callback for each phase of the debounce.
+     * De-bounce is adaptive: it allows short bursts and tries to learn the actual computation time.
+     *
+     * query: term to search
+     * immediate_cb(results) : if search was done without filtering
+     * suppress_cb(cached_results) : debounce has suppressed the search, return cache of last result
+     * finally_cb(results): if at least 1 suppression occurred, make a new search when debounce ends and call this.
+     *
+     * Reads options.interactive_debounce (initial wait; 0 disables debouncing),
+     * options.interactive_mult and options.interactive_burst.
+     * Timing uses the global `performance` clock when the host provides one, otherwise Date,
+     * so the function also works where there is no `window` global.
+     *
+     * @returns {function({string}, function({Array}), function({Array}), function({Array}))}
+     */
+    getInteractive: function () {
+
+        var self = this;
+        var options = this.options;
+        var wait = options.interactive_debounce;
+        var mult = options.interactive_mult;
+        var burst = options.interactive_burst;
+
+        // Debounce off
+        if (wait === 0) {
+            return (function (query, immediate_cb, suppress_cb, finally_cb) {
+                return immediate_cb(self.search(query))
+            })
+        }
+
+        // Debounce
+        // `typeof` guard instead of `window.performance`: touching an undeclared `window`
+        // throws a ReferenceError outside browsers.
+        var perf = (typeof performance !== 'undefined') ? performance : undefined;
+        var clock = (perf && perf.now) ? perf : Date;
+        var timeout, cache;
+        var count = 0, suppressed = false;
+
+        return function (query, immediate_cb, suppress_cb, finally_cb) {
+
+            // Runs once the calls stop for `wait` ms: replays the last query if any call was suppressed
+            var later = function () {
+                timeout = null;
+                if (suppressed) {
+                    cache = self.search(query);
+                    finally_cb(cache);
+                }
+                count = 0;
+                suppressed = false;
+            };
+
+            clearTimeout(timeout);
+            timeout = setTimeout(later, wait);
+
+            // the first (burst - 1) calls of a burst search immediately; later ones get the cache
+            if (++count < burst) {
+
+                suppressed = false;
+                var before = clock.now();
+                cache = self.search(query);
+                var ret = immediate_cb(cache);
+                var now = clock.now();
+
+                //try to learn typical time (time mult factor);
+                wait = 0.5 * wait + 0.5 * mult * (now - before);
+                //console.log(wait);
+                return ret;
+
+            } else {
+                suppressed = true;
+                //console.log("supress");
+                return suppress_cb(cache);
+            }
+        }
+
+    },
+
+    /**
+     * Allow the FuzzySearch object to be given as a source to twitter typeahead.
+     * This implements an interface similar to the Bloodhound object.
+     *
+     * @returns {function({string}, function({Array}) ,function({Array}) )} Interactive version of search.
+     */
+
+    __ttAdapter: function ttAdapter() {
+
+        var debounced = this.getInteractive();
+        // results of suppressed calls are discarded
+        var noop = function (a) {
+        };
+        // sync is passed as the immediate callback, async as the end-of-debounce callback
+        return function (query, sync, async) {
+            debounced(query, sync, noop, async);
+        }
+
+    },
+
+    /**
+     * Generate a function compatible with jQuery UI auto-complete Source
+     * `response` is passed as both the immediate and the end-of-debounce callback;
+     * results of suppressed calls are discarded.
+     *
+     * @returns {function( {Object}, {function()} )} Interactive version of search.
+     */
+    $uiSource: function () {
+
+        var debounced = this.getInteractive();
+        var noop = function (a) {
+        };
+        return function (request, response) {
+            debounced(request.term, response, noop, response);
+        }
+
+    }
+});
+//
+// Export FuzzySearch
+//
+// Exactly one of the three branches below runs, chosen by which module system is detected.
+//
+
+if (typeof require === 'function' && typeof module !== 'undefined' && module.exports) {
+
+    // CommonJS-like environments
+    module.exports = FuzzySearch;
+
+} else if (typeof define === 'function' && define.amd) {
+
+    // AMD. Register as an anonymous module.
+    define(function () {
+        return FuzzySearch;
+    });
+
+} else {
+
+    // Browser globals
+    // Neither CommonJS nor AMD detected: publish the constructor as globalThis.FuzzySearch
+    globalThis['FuzzySearch'] = FuzzySearch;
+
+}
+
+return FuzzySearch;
+
+})();
diff --git a/libraryServer.js b/libraryServer.js index ad3747a..910ea82 100644 --- a/libraryServer.js +++ b/libraryServer.js @@ -55,7 +55,7 @@ function addHandlers() { app.get('/search(.json)?', async (req, res) => { await Archivist.isReady(); - const {query, results:resultIds} = await Archivist.search(req.query.query); + const {query, results:resultIds, HL} = await Archivist.search(req.query.query); const results = resultIds.map(docId => Archivist.getDetails(docId)); if ( req.path.endsWith('.json') ) { res.end(JSON.stringify({ @@ -63,17 +63,9 @@ function addHandlers() { }, null, 2)); } else { results.forEach(r => { - const Offsets = Archivist.findOffsets(query, r.content, 3); - - r.snippet = []; - for ( const {substring,offset} of Offsets ) { - r.snippet.push(r.content.substring(offset-SNIP_CONTEXT, offset) + - `${substring}` + - r.content.substr(offset+substring.length, SNIP_CONTEXT) - ); - } + r.snippet = ['no snippet'] }); - 
res.end(SearchResultView({results, query})); + res.end(SearchResultView({results, query, HL})); } }); @@ -194,7 +186,7 @@ function IndexView(urls) { ` } -function SearchResultView({results, query}) { +function SearchResultView({results, query, HL}) { return ` @@ -249,7 +241,9 @@ function SearchResultView({results, query}) { ${ results.map(({snippet, url,title,id}) => `
  • - ${DEBUG ? id + ':' : ''} ${title||url} + ${DEBUG ? id + ':' : ''} ${HL.get(id)?.title||title||url} +
    + ${(HL.get(id)?.url||url).slice(0,128)}

    ${snippet.join('…')}

  • `).join('\n') diff --git a/todo b/todo index a01d807..00a2d77 100644 --- a/todo +++ b/todo @@ -1,3 +1,4 @@ +- fix snippets - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this - Snippets with highlights via levenshtein distance search from natural - Improve search page look