"Snippets getting better. Scoring still has some errors: 0 score for unrelated fragment? how ... discover this"

This commit is contained in:
Cris Stringfellow 2021-12-24 11:40:57 +08:00
parent 197ff25631
commit 0d37fe503b
3 changed files with 51 additions and 25 deletions

View File

@ -278,7 +278,7 @@ export default Archivist;
const {sessionId} = install;
if ( ! ConfirmedInstalls.has(sessionId) ) {
ConfirmedInstalls.add(sessionId);
console.log({confirmedInstall:val, context});
DEBUG && console.log({confirmedInstall:val, context});
}
}
@ -293,8 +293,9 @@ export default Archivist;
}
/**
 * Print `targetInfo` as pretty-printed JSON (2-space indent) when its
 * `type` is 'page'; any other type is ignored.
 *
 * @param {{targetInfo: object}} param0 - object carrying the `targetInfo` to inspect
 */
function displayTargetInfo({targetInfo}) {
// Function-local flag: takes precedence over any DEBUG defined in an enclosing scope.
const DEBUG = true;
if ( targetInfo.type === 'page' ) {
// NOTE(review): the next two lines log the same message, once unconditionally
// and once gated on DEBUG. They look like the before/after pair of a change
// shown together — confirm which one is meant to remain.
console.log("Target info", JSON.stringify(targetInfo, null, 2));
DEBUG && console.log("Target info", JSON.stringify(targetInfo, null, 2));
}
}
@ -385,7 +386,7 @@ export default Archivist;
if ( info.url.startsWith('chrome') ) return;
if ( dontCache(info) ) return;
console.log('Index URL called', info);
DEBUG && console.log('Index URL called', info);
if ( State.Indexing.has(info.targetId) ) return;
State.Indexing.add(info.targetId);
@ -441,14 +442,14 @@ export default Archivist;
doc.contentSignature = contentSignature;
fuzzy.add(doc);
State.Docs.set(url, doc);
console.log(doc,url);
DEBUG && console.log({updateFuzz: {doc,url}});
}
DEBUG && console.log("NDX updated", doc.ndx_id);
UpdatedKeys.add(url);
console.log({id: doc.id, title, url, indexed: true});
DEBUG && console.log({id: doc.id, title, url, indexed: true});
State.Indexing.delete(info.targetId);
}
@ -657,7 +658,6 @@ export default Archivist;
}
async function loadFuzzy() {
const DEBUG = true;
const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
doc.i_url = getURI(doc.url);
@ -684,7 +684,7 @@ export default Archivist;
path,
JSON.stringify(docs)
);
console.log(`Wrote fuzzy to ${path}`);
DEBUG && console.log(`Wrote fuzzy to ${path}`);
}
function clearSavers() {
@ -725,7 +725,6 @@ export default Archivist;
}
try {
const DEBUG = true;
const flexBase = getFlexBase();
Fs.readdirSync(flexBase, {withFileTypes:true}).forEach(dirEnt => {
if ( dirEnt.isFile() ) {
@ -862,7 +861,6 @@ export default Archivist;
}
function findOffsets(query, doc, count = 0) {
const DEBUG = true;
const hl = fuzzy.highlight(doc);
DEBUG && console.log(query, hl);
return hl;
@ -948,7 +946,6 @@ export default Archivist;
}
function combineResults({flex,ndx,fuzz}) {
const DEBUG = true;
DEBUG && console.log({flex,ndx,fuzz});
const score = {};
flex.forEach(countRank(score));
@ -1014,7 +1011,7 @@ export default Archivist;
console.error('Error writing full text search index', e);
}
});
console.log(`Wrote Flex to ${flexBase}`);
DEBUG && console.log(`Wrote Flex to ${flexBase}`);
NDX_FTSIndex.save(dir);
saveFuzzy(dir);
UpdatedKeys.clear();
@ -1114,7 +1111,7 @@ export default Archivist;
path,
objStr
);
console.log("Write NDX to ", path);
DEBUG && console.log("Write NDX to ", path);
},
load: newIndex => {
retVal.index = newIndex;
@ -1143,7 +1140,6 @@ export default Archivist;
}
function loadNDXIndex(ndxFTSIndex) {
const DEBUG = true;
if ( Fs.existsSync(getNDXPath()) ) {
const indexContent = Fs.readFileSync(getNDXPath()).toString();
const index = fromSerializable(JSON.parse(indexContent));

View File

@ -5,21 +5,29 @@ const CHUNK_SIZE = 24;
testHighlighter();
export function highlight(query, doc, {
maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE
} = {}) {
/**
 * Compute the bounds used to normalise an edit distance between a query and
 * a chunk of text.
 *
 * @param {number} qLength - length of the query, in symbols
 * @param {number} [chunkSize=CHUNK_SIZE] - length of the chunk being compared;
 *   falls back to the module-level CHUNK_SIZE when omitted
 * @returns {{MaxDist: number, MinScore: number, MaxScore: number}}
 *   MaxDist  - the chunk size (callers pass it as the distance threshold),
 *   MinScore - the absolute length difference, i.e. the fewest edits that can
 *              separate two strings of these lengths,
 *   MaxScore - the span used to scale `(distance - MinScore)`; algebraically
 *              this is the shorter of the two lengths.
 */
function params(qLength, chunkSize = CHUNK_SIZE) {
  // Honour the caller's chunk size instead of always using the module
  // constant, so the bounds agree with the fragments actually produced.
  const MaxDist = chunkSize;
  const MinScore = Math.abs(qLength - chunkSize);
  const MaxScore = Math.max(qLength, chunkSize) - MinScore;
  return {MaxDist, MinScore, MaxScore};
}
export function highlight(query, doc, {
maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE,
chunkSize: chunkSize = CHUNK_SIZE
} = {}) {
doc = Array.from(doc);
const highlights = [];
// use array from then length rather than string length to
// give accurate length for all unicode
const qLength = Array.from(query).length;
const MinScore = Math.abs(qLength - CHUNK_SIZE);
const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
const fragments = Array.from(doc).reduce(getFragmenter(CHUNK_SIZE), []);
const {MaxDist,MinScore,MaxScore} = params(qLength, chunkSize);
const fragments = doc.reduce(getFragmenter(chunkSize), []);
query.toLocaleLowerCase();
console.log(fragments);
const scores = fragments.map(fragment => {
const distance = ukkonen(query, fragment, MaxDist);
const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);
// the min score possible = the minimum number of edits between
const scaledScore = (distance - MinScore)/MaxScore;
return {score: scaledScore, fragment};
@ -27,7 +35,6 @@ export function highlight(query, doc, {
// sort ascending (smallest scores win)
scores.sort(({score:a}, {score:b}) => a-b);
console.log({scores});
for( const {score, fragment} of scores ) {
if ( score > maxAcceptScore ) {
@ -38,6 +45,26 @@ export function highlight(query, doc, {
if ( highlights.length === 0 ) {
console.log('Zero highlights, showing first score', scores[0]);
return scores.slice(0,1);
} else {
let better = JSON.parse(JSON.stringify(highlights)).slice(0, 10);
better = better.map(hl => {
const length = Array.from(hl.fragment.text).length;
const extra = Math.round(length/2);
let {offset} = hl.fragment;
const newText = doc.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + doc.slice(offset + length, offset + length + extra).join('');
//console.log({newText, oldText:hl.fragment.text});
hl.fragment.text = newText;
const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);
// the min score possible = the minimum number of edits between
const scaledScore = (distance - MinScore)/MaxScore;
hl.score = scaledScore;
return hl;
});
better.sort(({score:a}, {score:b}) => a-b);
console.log(JSON.stringify({better},null,2));
return better.slice(0,3);
}
return highlights;
@ -60,9 +87,9 @@ function getFragmenter(chunkSize) {
// keep adding to the currentFragment
if ( frags.length && ((currentLength + 1) <= chunkSize) ) {
currentFrag = frags.pop();
currentFrag += nextSymbol;
currentFrag.text += nextSymbol;
} else {
currentFrag = nextSymbol;
currentFrag = {text:nextSymbol, offset:index};
currentLength = 0;
}
currentLength++;

View File

@ -5,6 +5,7 @@ import express from 'express';
import args from './args.js';
import {DEBUG, say, sleep, APP_ROOT, SNIP_CONTEXT} from './common.js';
import Archivist from './archivist.js';
import {highlight} from './highlighter.js';
const SITE_PATH = path.resolve(APP_ROOT, 'public');
@ -63,7 +64,9 @@ function addHandlers() {
}, null, 2));
} else {
results.forEach(r => {
r.snippet = Archivist.findOffsets(query, r.content.slice(0,150));
r.snippet = Archivist.findOffsets(query,
highlight(query, r.content).map(hl => hl.fragment.text).join('&hellip;')
);
});
res.end(SearchResultView({results, query, HL}));
}