"Fixed an issue with fuzzy and i_url. need to improve snippets"

This commit is contained in:
Cris Stringfellow 2021-12-22 17:11:24 +08:00
parent c8517f6b0f
commit a050a1a43d
4 changed files with 3098 additions and 30 deletions

View File

@ -21,11 +21,12 @@
addDocumentToIndex as ndx, addDocumentToIndex as ndx,
removeDocumentFromIndex, removeDocumentFromIndex,
vacuumIndex vacuumIndex
} from './lib/ndx.js'; } from 'ndx';
import { query as NDXQuery } from 'ndx-query'; import { query as NDXQuery } from 'ndx-query';
import { toSerializable, fromSerializable } from 'ndx-serializable'; import { toSerializable, fromSerializable } from 'ndx-serializable';
//import { DocumentIndex } from 'ndx'; //import { DocumentIndex } from 'ndx';
import Fuzzy from 'fz-search'; //import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural'; import Nat from 'natural';
import args from './args.js'; import args from './args.js';
@ -41,6 +42,7 @@
// search related state: constants and variables // search related state: constants and variables
// common // common
const Fuzzy = globalThis.FuzzySearch;
const NDX_OLD = false; const NDX_OLD = false;
const USE_FLEX = true; const USE_FLEX = true;
const FTS_INDEX_DIR = args.fts_index_dir; const FTS_INDEX_DIR = args.fts_index_dir;
@ -50,10 +52,6 @@
NDX_ID_KEY NDX_ID_KEY
]); ]);
const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key); const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key);
const nextOffset = (query, doc, startAt = 0) => Nat.LevenshteinDistanceSearch(
query,
doc.slice(startAt)
);
let Id; let Id;
// natural (NLP tools -- stemmers and tokenizers, etc) // natural (NLP tools -- stemmers and tokenizers, etc)
@ -441,6 +439,7 @@ export default Archivist;
doc.contentSignature = contentSignature; doc.contentSignature = contentSignature;
fuzzy.add(doc); fuzzy.add(doc);
State.Docs.set(url, doc); State.Docs.set(url, doc);
console.log(doc,url);
} }
DEBUG && console.log("NDX updated", doc.ndx_id); DEBUG && console.log("NDX updated", doc.ndx_id);
@ -660,6 +659,7 @@ export default Archivist;
const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString(); const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => { State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
doc.contentSignature = getContentSig(doc); doc.contentSignature = getContentSig(doc);
console.log(doc.url, doc);
return [doc.url, doc]; return [doc.url, doc];
})); }));
await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc))); await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc)));
@ -676,7 +676,7 @@ export default Archivist;
function saveFuzzy(basePath) { function saveFuzzy(basePath) {
const docs = [...State.Docs.values()] const docs = [...State.Docs.values()]
.map(({url, title, content, id}) => ({url, title, content, id})); .map(({i_url, url, title, content, id}) => ({i_url, url, title, content, id}));
const path = getFuzzyPath(basePath); const path = getFuzzyPath(basePath);
Fs.writeFileSync( Fs.writeFileSync(
path, path,
@ -862,18 +862,12 @@ export default Archivist;
function findOffsets(query, doc, count) { function findOffsets(query, doc, count) {
let res = []; let res = [];
let i = 0; const result = Nat.LevenshteinDistanceSearch(query, doc);
while(i < doc.length) {
const result = nextOffset(query, doc, i); if ( result.distance/result.substring.length < 0.5 ) {
console.log(result, i);
i += result.offset + result.substring.length + SNIP_CONTEXT;
res.push(result); res.push(result);
} }
res.sort(({distance:a}, {distance:b}) => a-b);
console.log({res});
res = res.slice(0, count);
return res; return res;
} }
@ -939,7 +933,15 @@ export default Archivist;
const results = combineResults({flex, ndx, fuzz}); const results = combineResults({flex, ndx, fuzz});
return {query,results}; const highlights = fuzzRaw.map(obj => ({
id: obj.id,
url: fuzzy.highlight(obj.url),
title: fuzzy.highlight(State.Index.get(obj.id).title),
}));
const HL = new Map();
highlights.forEach(hl => HL.set(hl.id, hl));
return {query,results, HL};
} }
function combineResults({flex,ndx,fuzz}) { function combineResults({flex,ndx,fuzz}) {

3071
lib/fz.js Normal file

File diff suppressed because it is too large Load Diff

View File

@ -55,7 +55,7 @@ function addHandlers() {
app.get('/search(.json)?', async (req, res) => { app.get('/search(.json)?', async (req, res) => {
await Archivist.isReady(); await Archivist.isReady();
const {query, results:resultIds} = await Archivist.search(req.query.query); const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
const results = resultIds.map(docId => Archivist.getDetails(docId)); const results = resultIds.map(docId => Archivist.getDetails(docId));
if ( req.path.endsWith('.json') ) { if ( req.path.endsWith('.json') ) {
res.end(JSON.stringify({ res.end(JSON.stringify({
@ -63,17 +63,9 @@ function addHandlers() {
}, null, 2)); }, null, 2));
} else { } else {
results.forEach(r => { results.forEach(r => {
const Offsets = Archivist.findOffsets(query, r.content, 3); r.snippet = ['no snippet']
r.snippet = [];
for ( const {substring,offset} of Offsets ) {
r.snippet.push(r.content.substring(offset-SNIP_CONTEXT, offset) +
`<strong>${substring}</strong>` +
r.content.substr(offset+substring.length, SNIP_CONTEXT)
);
}
}); });
res.end(SearchResultView({results, query})); res.end(SearchResultView({results, query, HL}));
} }
}); });
@ -194,7 +186,7 @@ function IndexView(urls) {
` `
} }
function SearchResultView({results, query}) { function SearchResultView({results, query, HL}) {
return ` return `
<!DOCTYPE html> <!DOCTYPE html>
<meta charset=utf-8> <meta charset=utf-8>
@ -249,7 +241,9 @@ function SearchResultView({results, query}) {
${ ${
results.map(({snippet, url,title,id}) => ` results.map(({snippet, url,title,id}) => `
<li> <li>
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${title||url}</a> ${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
<br>
<small>${(HL.get(id)?.url||url).slice(0,128)}</small>
<p>${snippet.join('&hellip;')}</p> <p>${snippet.join('&hellip;')}</p>
</li> </li>
`).join('\n') `).join('\n')

1
todo
View File

@ -1,3 +1,4 @@
- fix snippets
- An error in full-text search can corrupt the index and make it unrecoverable; we need to guard against this - An error in full-text search can corrupt the index and make it unrecoverable; we need to guard against this
- Snippets with highlights via levenshtein distance search from natural - Snippets with highlights via levenshtein distance search from natural
- Improve search page look - Improve search page look