"Fixed an issue with fuzzy and i_url. Need to improve snippets."

This commit is contained in:
Cris Stringfellow 2021-12-22 17:11:24 +08:00
parent c8517f6b0f
commit a050a1a43d
4 changed files with 3098 additions and 30 deletions

View File

@ -21,11 +21,12 @@
addDocumentToIndex as ndx,
removeDocumentFromIndex,
vacuumIndex
} from './lib/ndx.js';
} from 'ndx';
import { query as NDXQuery } from 'ndx-query';
import { toSerializable, fromSerializable } from 'ndx-serializable';
//import { DocumentIndex } from 'ndx';
import Fuzzy from 'fz-search';
//import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural';
import args from './args.js';
@ -41,6 +42,7 @@
// search related state: constants and variables
// common
const Fuzzy = globalThis.FuzzySearch;
const NDX_OLD = false;
const USE_FLEX = true;
const FTS_INDEX_DIR = args.fts_index_dir;
@ -50,10 +52,6 @@
NDX_ID_KEY
]);
const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key);
const nextOffset = (query, doc, startAt = 0) => Nat.LevenshteinDistanceSearch(
query,
doc.slice(startAt)
);
let Id;
// natural (NLP tools -- stemmers and tokenizers, etc)
@ -441,6 +439,7 @@ export default Archivist;
doc.contentSignature = contentSignature;
fuzzy.add(doc);
State.Docs.set(url, doc);
console.log(doc,url);
}
DEBUG && console.log("NDX updated", doc.ndx_id);
@ -660,6 +659,7 @@ export default Archivist;
const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
doc.contentSignature = getContentSig(doc);
console.log(doc.url, doc);
return [doc.url, doc];
}));
await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc)));
@ -676,7 +676,7 @@ export default Archivist;
function saveFuzzy(basePath) {
const docs = [...State.Docs.values()]
.map(({url, title, content, id}) => ({url, title, content, id}));
.map(({i_url, url, title, content, id}) => ({i_url, url, title, content, id}));
const path = getFuzzyPath(basePath);
Fs.writeFileSync(
path,
@ -862,18 +862,12 @@ export default Archivist;
function findOffsets(query, doc, count) {
let res = [];
let i = 0;
while(i < doc.length) {
const result = nextOffset(query, doc, i);
console.log(result, i);
i += result.offset + result.substring.length + SNIP_CONTEXT;
const result = Nat.LevenshteinDistanceSearch(query, doc);
if ( result.distance/result.substring.length < 0.5 ) {
res.push(result);
}
res.sort(({distance:a}, {distance:b}) => a-b);
console.log({res});
res = res.slice(0, count);
return res;
}
@ -939,7 +933,15 @@ export default Archivist;
const results = combineResults({flex, ndx, fuzz});
return {query,results};
const highlights = fuzzRaw.map(obj => ({
id: obj.id,
url: fuzzy.highlight(obj.url),
title: fuzzy.highlight(State.Index.get(obj.id).title),
}));
const HL = new Map();
highlights.forEach(hl => HL.set(hl.id, hl));
return {query,results, HL};
}
function combineResults({flex,ndx,fuzz}) {

3071
lib/fz.js Normal file

File diff suppressed because it is too large Load Diff

View File

@ -55,7 +55,7 @@ function addHandlers() {
app.get('/search(.json)?', async (req, res) => {
await Archivist.isReady();
const {query, results:resultIds} = await Archivist.search(req.query.query);
const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
const results = resultIds.map(docId => Archivist.getDetails(docId));
if ( req.path.endsWith('.json') ) {
res.end(JSON.stringify({
@ -63,17 +63,9 @@ function addHandlers() {
}, null, 2));
} else {
results.forEach(r => {
const Offsets = Archivist.findOffsets(query, r.content, 3);
r.snippet = [];
for ( const {substring,offset} of Offsets ) {
r.snippet.push(r.content.substring(offset-SNIP_CONTEXT, offset) +
`<strong>${substring}</strong>` +
r.content.substr(offset+substring.length, SNIP_CONTEXT)
);
}
r.snippet = ['no snippet']
});
res.end(SearchResultView({results, query}));
res.end(SearchResultView({results, query, HL}));
}
});
@ -194,7 +186,7 @@ function IndexView(urls) {
`
}
function SearchResultView({results, query}) {
function SearchResultView({results, query, HL}) {
return `
<!DOCTYPE html>
<meta charset=utf-8>
@ -249,7 +241,9 @@ function SearchResultView({results, query}) {
${
results.map(({snippet, url,title,id}) => `
<li>
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${title||url}</a>
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
<br>
<small>${(HL.get(id)?.url||url).slice(0,128)}</small>
<p>${snippet.join('&hellip;')}</p>
</li>
`).join('\n')

1
todo
View File

@ -1,3 +1,4 @@
- Fix snippets
- An error in full-text search can corrupt the index and make it unrecoverable; we need to guard against this
- Snippets with highlights via levenshtein distance search from natural
- Improve search page look