From 37013ed080a839371e1894c1c6786a4660376084 Mon Sep 17 00:00:00 2001 From: Cris Stringfellow <22254235+crislin2046@users.noreply.github.com> Date: Thu, 23 Dec 2021 11:01:20 +0800 Subject: [PATCH] "Snippets" --- archivist.js | 37 +++++++--------- lib/fuzzy.js | 109 +++++++++++++++++++++++++++++++++++++++++++++++ lib/testFuzzy.js | 6 +++ libraryServer.js | 9 +--- todo | 2 + 5 files changed, 135 insertions(+), 28 deletions(-) create mode 100644 lib/fuzzy.js create mode 100644 lib/testFuzzy.js diff --git a/archivist.js b/archivist.js index 6d4e006..1b84279 100644 --- a/archivist.js +++ b/archivist.js @@ -28,6 +28,8 @@ //import Fuzzy from 'fz-search'; import * as _Fuzzy from './lib/fz.js'; import Nat from 'natural'; + //import match from 'autosuggest-highlight/match'; + //import parse from 'autosuggest-highlight/parse'; import args from './args.js'; import { @@ -859,22 +861,10 @@ export default Archivist; return {url, title, id, content}; } - function findOffsets(query, doc, count) { - // this is the slow part - let res = []; - - const result = Nat.LevenshteinDistanceSearch(query, doc); - - if ( result.distance/result.substring.length < 0.5 ) { - const {substring,offset} = result; - res.push( - doc.substring(offset-50, offset) + - `${substring}` + - doc.substr(substring.length + offset, 50) - ); - } - - return res; + function findOffsets(query, doc, count = 0) { + const hl = fuzzy.highlight(doc); + DEBUG && console.log(hl); + return hl; } function beforePathChanged() { @@ -944,8 +934,8 @@ export default Archivist; const title = State.Index.get(obj.url)?.title; return { id: obj.id, - url: Archivist.findOffsets(query, obj.url)[0] || obj.url, - title: Archivist.findOffsets(query, title)[0] || title, + url: Archivist.findOffsets(query, obj.url) || obj.url, + title: Archivist.findOffsets(query, title) || title, }; }); highlights.forEach(hl => HL.set(hl.id, hl)); @@ -961,9 +951,14 @@ export default Archivist; fuzz.forEach(countRank(score)); const results = 
[...Object.values(score)].map(obj => { - const {id} = State.Index.get(obj.url); - obj.id = id; - return obj; + try { + const {id} = State.Index.get(obj.url); + obj.id = id; + return obj; + } catch(e) { + console.log(obj, State.Index, e); + throw e; + } }); results.sort(({score:scoreA}, {score:scoreB}) => scoreA-scoreB); const resultIds = results.map(({id}) => id); diff --git a/lib/fuzzy.js b/lib/fuzzy.js new file mode 100644 index 0000000..3201387 --- /dev/null +++ b/lib/fuzzy.js @@ -0,0 +1,109 @@ +/** + * Modified Dec 23 2021 by Cris Stringfellow + * fuzzy.js v0.1.0 + * (c) 2016 Ben Ripkens + * @license: MIT + */ + // NOTES + /* + * Whether or not fuzzy.js should analyze sub-terms, i.e. also + * check term starting positions != 0. + * + * Example: + * Given the term 'Halleluja' and query 'luja' + * + * Fuzzy.js scores this combination with a 14, when analyzeSubTerms is + * set to false, as the following matching string will be calculated: + * Ha[l]lel[uja] + * + * If you activate sub-term analysis though, the query will reach a score + * of 19, as the matching string looks as follows: + * Halle[luja] + * + * Naturally, the second version is more expensive than the first one. + * You should therefore configure how many sub terms you wish to analyse. + * This can be configured through fuzzy.analyzeSubTermDepth = 10. + */ + fuzzy.analyzeSubTerms = false; + /* + * How many sub terms should be analyzed. + */ + fuzzy.analyzeSubTermDepth = 10; + fuzzy.highlighting = { + before: '', + after: '' + }; + fuzzy.matchComparator = function matchComparator(m1, m2) { + return (m2.score - m1.score != 0) ? 
m2.score - m1.score : m1.term.length - m2.term.length; + }; + + export default function fuzzy(term, query) { + var max = calcFuzzyScore(term, query); + var termLength = term.length; + + if (fuzzy.analyzeSubTerms) { + + for (var i = 1; i < termLength && i < fuzzy.analyzeSubTermDepth; i++) { + var subTerm = term.substring(i); + var score = calcFuzzyScore(subTerm, query); + if (score.score > max.score) { + // we need to correct 'term' and 'highlightedTerm', as calcFuzzyScore + // does not know that it operates on a substring. Doing it only for + // new maximum score to save some performance. + score.term = term; + score.highlightedTerm = term.substring(0, i) + score.highlightedTerm; + max = score; + } + } + } + + return max; + } + + function calcFuzzyScore(term, query) { + var score = 0; + var termLength = term.length; + var queryLength = query.length; + var highlighting = ''; + var ti = 0; + // -1 would not work as this would break the calculations of bonus + // points for subsequent character matches. Something like + // Number.MIN_VALUE would be more appropriate, but unfortunately + // Number.MIN_VALUE + 1 equals 1... 
+ var previousMatchingCharacter = -2; + + for (var qi = 0; qi < queryLength && ti < termLength; qi++) { + var qc = query.charAt(qi); + var lowerQc = qc.toLowerCase(); + + for (; ti < termLength; ti++) { + var tc = term.charAt(ti); + + if (lowerQc === tc.toLowerCase()) { + score++; + + if ((previousMatchingCharacter + 1) === ti) { + score += 5; + } + + highlighting += fuzzy.highlighting.before + + tc + + fuzzy.highlighting.after; + previousMatchingCharacter = ti; + ti++; + break; + } else { + highlighting += tc; + } + } + } + + highlighting += term.substring(ti, term.length); + + return { + score: score, + term: term, + query: query, + highlightedTerm: highlighting + }; + }; diff --git a/lib/testFuzzy.js b/lib/testFuzzy.js new file mode 100644 index 0000000..78e6e9b --- /dev/null +++ b/lib/testFuzzy.js @@ -0,0 +1,6 @@ +import fuzzy from './fuzzy.js'; + +console.log(fuzzy); + +const doc = 'Meghan Markle requested this unexpected Christmas present for Archie from the Queen'; +console.log(fuzzy(doc, 'Queen')); diff --git a/libraryServer.js b/libraryServer.js index ef55cc7..09f3320 100644 --- a/libraryServer.js +++ b/libraryServer.js @@ -63,12 +63,7 @@ function addHandlers() { }, null, 2)); } else { results.forEach(r => { - const m = Archivist.findOffsets(query, r.content); - if ( m.length ) { - r.snippet = m; - } else { - r.snippet = [r.content.slice(0, 150)]; - } + r.snippet = Archivist.findOffsets(query, r.content.slice(0,150)); }); res.end(SearchResultView({results, query, HL})); } @@ -249,7 +244,7 @@ function SearchResultView({results, query, HL}) { ${DEBUG ? id + ':' : ''} ${HL.get(id)?.title||title||url}
${(HL.get(id)?.url||url).slice(0,128)} -

${snippet.join('…')}

+

${snippet}

`).join('\n') } diff --git a/todo b/todo index deb0e73..4f4a793 100644 --- a/todo +++ b/todo @@ -1,3 +1,5 @@ +- use ukkonen to find snippets +- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible) - Create instant search (or at least instant queries (so search over previous queries -- not results necessarily)) - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this - Improve search page look