"Fixed the windowed highlight bug, where incorrect offsets were occurring. Now using the correct source array to remove that problem. Also improved highlight matching by adding a minimum match score (minimum_match) to fuzzy.options (3, versus the default of 1)."

This commit is contained in:
Cris Stringfellow 2021-12-25 13:47:21 +08:00
parent 56bb405520
commit ac74c809eb
3 changed files with 15 additions and 11 deletions

View File

@ -84,6 +84,13 @@
DEBUG && console.log({NDX_FTSIndex});
// fuzzy (maybe just for queries ?)
// Options merged into fuzzy.options (via Object.assign) right before fuzzy.search(query) runs.
const REGULAR_SEARCH_OPTIONS_FUZZY = {
minimum_match: 1.0
};
// Options merged into fuzzy.options (via Object.assign) right before fuzzy.highlight(doc) runs.
// Uses a higher minimum_match than regular search (3.0 versus 1.0).
const HIGHLIGHT_OPTIONS_FUZZY = {
minimum_match: 3.0
};
// Keys are whatever ndxDocFields({namesOnly:true}) returns.
// NOTE(review): presumably the document field names the fuzzy index searches over — confirm at the use site.
const FUZZ_OPTS = {
keys: ndxDocFields({namesOnly:true})
};
@ -864,6 +871,7 @@ export default Archivist;
if ( maxLength ) {
doc = Array.from(doc).slice(0, maxLength).join('');
}
Object.assign(fuzzy.options, HIGHLIGHT_OPTIONS_FUZZY);
const hl = fuzzy.highlight(doc);
DEBUG && console.log(query, hl);
return hl;
@ -931,6 +939,7 @@ export default Archivist;
url: State.Index.get('ndx'+r.key),
score: r.score
}));
Object.assign(fuzzy.options, REGULAR_SEARCH_OPTIONS_FUZZY);
const fuzzRaw = fuzzy.search(query);
const fuzz = processFuzzResults(fuzzRaw);

View File

@ -64,13 +64,13 @@ export function highlight(query, doc, {
console.log('Zero highlights, showing first score', scores[0]);
return scores.slice(0,1);
} else {
let better = JSON.parse(JSON.stringify(highlights)).slice(0, 10);
let better = Array.from(highlights).slice(0, 10);
better = better.map(hl => {
const length = Array.from(hl.fragment.text).length;
const extra = Math.round(length/2);
let {offset} = hl.fragment;
const newText = doc.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + doc.slice(offset + length, offset + length + extra).join('');
//console.log({newText, oldText:hl.fragment.text});
let {offset, symbols} = hl.fragment;
const newText = symbols.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + symbols.slice(offset + length, offset + length + extra).join('');
DEBUG && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});
hl.fragment.text = newText;
const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);
@ -106,7 +106,7 @@ function getFragmenter(chunkSize) {
currentFrag = frags.pop();
currentFrag.text += nextSymbol;
} else {
currentFrag = {text:nextSymbol, offset:index};
currentFrag = {text:nextSymbol, offset:index, symbols};
currentLength = 0;
}
currentLength++;

7
todo
View File

@ -1,10 +1,5 @@
- highlights are mostly rubbish right now
- implement trigram index
- try an exact match on the query term if possible for highlight. first one.
- don't highlight small matches like:
- search: Zuckerberg, top result: Hacker News - Top Links
- highlight Ha<strong>cker</strong> News
- WTF come on... I need a threshold on this stuff... or, if I can find a good match in the
body, then don't highlight a worse match in the title... or maybe I can use ukkonen as part of the threshold
- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this