"Snippets getting better. Scoring still has some errors : 0 score for unrelated framgnet? how ... discover this"
parent 197ff25631
commit 0d37fe503b

archivist.js | 22

archivist.js
@@ -278,7 +278,7 @@ export default Archivist;
      const {sessionId} = install;
      if ( ! ConfirmedInstalls.has(sessionId) ) {
        ConfirmedInstalls.add(sessionId);
-       console.log({confirmedInstall:val, context});
+       DEBUG && console.log({confirmedInstall:val, context});
      }
    }

@@ -293,8 +293,9 @@ export default Archivist;
  }

  function displayTargetInfo({targetInfo}) {
+   const DEBUG = true;
    if ( targetInfo.type === 'page' ) {
-     console.log("Target info", JSON.stringify(targetInfo, null, 2));
+     DEBUG && console.log("Target info", JSON.stringify(targetInfo, null, 2));
    }
  }

@@ -385,7 +386,7 @@ export default Archivist;
    if ( info.url.startsWith('chrome') ) return;
    if ( dontCache(info) ) return;

-   console.log('Index URL called', info);
+   DEBUG && console.log('Index URL called', info);

    if ( State.Indexing.has(info.targetId) ) return;
    State.Indexing.add(info.targetId);
@@ -441,14 +442,14 @@ export default Archivist;
      doc.contentSignature = contentSignature;
      fuzzy.add(doc);
      State.Docs.set(url, doc);
-     console.log(doc,url);
+     DEBUG && console.log({updateFuzz: {doc,url}});
    }

    DEBUG && console.log("NDX updated", doc.ndx_id);

    UpdatedKeys.add(url);

-   console.log({id: doc.id, title, url, indexed: true});
+   DEBUG && console.log({id: doc.id, title, url, indexed: true});

    State.Indexing.delete(info.targetId);
  }
@@ -657,7 +658,6 @@ export default Archivist;
  }

  async function loadFuzzy() {
-   const DEBUG = true;
    const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
    State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
      doc.i_url = getURI(doc.url);
@@ -684,7 +684,7 @@ export default Archivist;
      path,
      JSON.stringify(docs)
    );
-   console.log(`Wrote fuzzy to ${path}`);
+   DEBUG && console.log(`Wrote fuzzy to ${path}`);
  }

  function clearSavers() {
@@ -725,7 +725,6 @@ export default Archivist;
  }

  try {
-   const DEBUG = true;
    const flexBase = getFlexBase();
    Fs.readdirSync(flexBase, {withFileTypes:true}).forEach(dirEnt => {
      if ( dirEnt.isFile() ) {
@@ -862,7 +861,6 @@ export default Archivist;
  }

  function findOffsets(query, doc, count = 0) {
-   const DEBUG = true;
    const hl = fuzzy.highlight(doc);
    DEBUG && console.log(query, hl);
    return hl;
@@ -948,7 +946,6 @@ export default Archivist;
  }

  function combineResults({flex,ndx,fuzz}) {
-   const DEBUG = true;
    DEBUG && console.log({flex,ndx,fuzz});
    const score = {};
    flex.forEach(countRank(score));
@@ -1014,7 +1011,7 @@ export default Archivist;
        console.error('Error writing full text search index', e);
      }
    });
-   console.log(`Wrote Flex to ${flexBase}`);
+   DEBUG && console.log(`Wrote Flex to ${flexBase}`);
    NDX_FTSIndex.save(dir);
    saveFuzzy(dir);
    UpdatedKeys.clear();
@@ -1114,7 +1111,7 @@ export default Archivist;
        path,
        objStr
      );
-     console.log("Write NDX to ", path);
+     DEBUG && console.log("Write NDX to ", path);
    },
    load: newIndex => {
      retVal.index = newIndex;
@@ -1143,7 +1140,6 @@ export default Archivist;
  }

  function loadNDXIndex(ndxFTSIndex) {
-   const DEBUG = true;
    if ( Fs.existsSync(getNDXPath()) ) {
      const indexContent = Fs.readFileSync(getNDXPath()).toString();
      const index = fromSerializable(JSON.parse(indexContent));

@@ -5,21 +5,29 @@ const CHUNK_SIZE = 24;

testHighlighter();

-export function highlight(query, doc, {
-  maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE
-} = {}) {
+function params(qLength, chunkSize) {
+  const MaxDist = CHUNK_SIZE;
+  const MinScore = Math.abs(qLength - CHUNK_SIZE);
+  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
+  return {MaxDist,MinScore,MaxScore};
+}
+
+export function highlight(query, doc, {
+  maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE,
+  chunkSize: chunkSize = CHUNK_SIZE
+} = {}) {
+  doc = Array.from(doc);
  const highlights = [];
  // use array from then length rather than string length to
  // give accurate length for all unicode
  const qLength = Array.from(query).length;
-  const MinScore = Math.abs(qLength - CHUNK_SIZE);
-  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
-
-  const fragments = Array.from(doc).reduce(getFragmenter(CHUNK_SIZE), []);
+  const {MaxDist,MinScore,MaxScore} = params(qLength, chunkSize);
+  const fragments = doc.reduce(getFragmenter(chunkSize), []);
+  query.toLocaleLowerCase();
  console.log(fragments);

  const scores = fragments.map(fragment => {
-    const distance = ukkonen(query, fragment, MaxDist);
+    const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);
    // the min score possible = the minimum number of edits between
    const scaledScore = (distance - MinScore)/MaxScore;
    return {score: scaledScore, fragment};
@@ -27,7 +35,6 @@ export function highlight(query, doc, {

  // sort ascending (smallest scores win)
  scores.sort(({score:a}, {score:b}) => a-b);
-  console.log({scores});

  for( const {score, fragment} of scores ) {
    if ( score > maxAcceptScore ) {
@@ -38,6 +45,26 @@ export function highlight(query, doc, {

  if ( highlights.length === 0 ) {
    console.log('Zero highlights, showing first score', scores[0]);
+    return scores.slice(0,1);
+  } else {
+    let better = JSON.parse(JSON.stringify(highlights)).slice(0, 10);
+    better = better.map(hl => {
+      const length = Array.from(hl.fragment.text).length;
+      const extra = Math.round(length/2);
+      let {offset} = hl.fragment;
+      const newText = doc.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + doc.slice(offset + length, offset + length + extra).join('');
+      //console.log({newText, oldText:hl.fragment.text});
+      hl.fragment.text = newText;
+      const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
+      const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);
+      // the min score possible = the minimum number of edits between
+      const scaledScore = (distance - MinScore)/MaxScore;
+      hl.score = scaledScore;
+      return hl;
+    });
+    better.sort(({score:a}, {score:b}) => a-b);
+    console.log(JSON.stringify({better},null,2));
+    return better.slice(0,3);
  }

  return highlights;
@@ -60,9 +87,9 @@ function getFragmenter(chunkSize) {
    // keep adding to the currentFragment
    if ( frags.length && ((currentLength + 1) <= chunkSize) ) {
      currentFrag = frags.pop();
-     currentFrag += nextSymbol;
+     currentFrag.text += nextSymbol;
    } else {
-     currentFrag = nextSymbol;
+     currentFrag = {text:nextSymbol, offset:index};
      currentLength = 0;
    }
    currentLength++;

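A side note, not part of the commit: after the getFragmenter change above, fragments are no longer plain strings but {text, offset} objects, and that offset is what the new else-branch in highlight() (apparently in highlighter.js, given the import added in the next file) uses to widen a winning fragment by half its length on each side before re-scoring it. A rough standalone sketch of that chunking, assuming fragments are simple fixed-size slices of the code-point array; the helper name fragmentize and the sample sentence are made up, this is not the project's code:

// Hypothetical sketch of what the fragmenter appears to produce after this
// change: chunkSize-symbol pieces that remember where they start in the doc.
function fragmentize(doc, chunkSize = 24) {
  const symbols = Array.from(doc);            // code-point aware split
  const fragments = [];
  for (let offset = 0; offset < symbols.length; offset += chunkSize) {
    fragments.push({
      text: symbols.slice(offset, offset + chunkSize).join(''),
      offset,                                 // index of the fragment's first symbol
    });
  }
  return fragments;
}

console.log(fragmentize('The quick brown fox jumps over the lazy dog'));
// [ { text: 'The quick brown fox jump', offset: 0 },
//   { text: 's over the lazy dog', offset: 24 } ]
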
@@ -5,6 +5,7 @@ import express from 'express';
import args from './args.js';
import {DEBUG, say, sleep, APP_ROOT, SNIP_CONTEXT} from './common.js';
import Archivist from './archivist.js';
+import {highlight} from './highlighter.js';

const SITE_PATH = path.resolve(APP_ROOT, 'public');

@@ -63,7 +64,9 @@ function addHandlers() {
      }, null, 2));
    } else {
      results.forEach(r => {
-       r.snippet = Archivist.findOffsets(query, r.content.slice(0,150));
+       r.snippet = Archivist.findOffsets(query,
+         highlight(query, r.content).map(hl => hl.fragment.text).join('…')
+       );
      });
      res.end(SearchResultView({results, query, HL}));
    }
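On the question in the commit message (a 0 score for an unrelated fragment): with the scoring used in the highlight() hunk above, score = (distance - MinScore)/MaxScore where MinScore = |qLength - CHUNK_SIZE|, a short query reaches score 0 against any 24-character fragment that merely contains the query's characters in order, because the edit distance then sits at its floor MinScore. A minimal sketch, assuming a plain Levenshtein distance in place of the ukkonen package; the helper names and sample strings below are illustrative, not from the repository:

const CHUNK_SIZE = 24;  // same constant as in the hunk above

// Plain dynamic-programming Levenshtein distance, standing in for ukkonen().
function editDistance(a, b) {
  const m = a.length, n = b.length;
  const row = Array.from({length: n + 1}, (_, j) => j);
  for (let i = 1; i <= m; i++) {
    let prev = row[0];       // holds dist(i-1, j-1)
    row[0] = i;
    for (let j = 1; j <= n; j++) {
      const above = row[j];  // dist(i-1, j)
      row[j] = Math.min(
        above + 1,                               // delete a[i-1]
        row[j - 1] + 1,                          // insert b[j-1]
        prev + (a[i - 1] === b[j - 1] ? 0 : 1)   // substitute or match
      );
      prev = above;
    }
  }
  return row[n];
}

// Same scaling as the diff: MinScore is the unavoidable length difference,
// MaxScore the remaining editable span; 0 means "as close as possible".
function scaledScore(query, fragmentText) {
  const qLength = Array.from(query).length;
  const MinScore = Math.abs(qLength - CHUNK_SIZE);
  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
  const distance = editDistance(query.toLocaleLowerCase(), fragmentText.toLocaleLowerCase());
  return (distance - MinScore) / MaxScore;
}

// 'the' appears as a subsequence (t…h…e) of this unrelated 24-char fragment,
// so the distance bottoms out at MinScore = 21 and the score is exactly 0.
console.log(scaledScore('the', 'bath over my meal, sir!!')); // 0
// No characters in common: distance = 24, score = (24 - 21) / 3 = 1.
console.log(scaledScore('the', 'zzzz qqqq xxxx vvvv wwww')); // 1

The diff by itself does not confirm that this is what produced the scores mentioned in the commit message; it is only one way the formula can hit 0. Logging the raw distance next to each scaled score would show whether the suspect fragments are hitting that floor.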