"Fixed an issue with fuzzy and i_url. need to improve snippets"
This commit is contained in:
parent
c8517f6b0f
commit
a050a1a43d
36
archivist.js
36
archivist.js
@@ -21,11 +21,12 @@
   addDocumentToIndex as ndx,
   removeDocumentFromIndex,
   vacuumIndex
-} from './lib/ndx.js';
+} from 'ndx';
 import { query as NDXQuery } from 'ndx-query';
 import { toSerializable, fromSerializable } from 'ndx-serializable';
 //import { DocumentIndex } from 'ndx';
-import Fuzzy from 'fz-search';
+//import Fuzzy from 'fz-search';
+import * as _Fuzzy from './lib/fz.js';
 import Nat from 'natural';

 import args from './args.js';
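For orientation: the hunk above moves the full-text-search imports from the vendored ./lib/ndx.js back to the published ndx packages. A minimal sketch of how ndx, ndx-query, and ndx-serializable typically fit together, following the ndx README; the two-field layout, tokenizer, and filter here are assumptions for illustration, not this repo's code:

import { createIndex, addDocumentToIndex } from 'ndx';
import { query } from 'ndx-query';
import { toSerializable, fromSerializable } from 'ndx-serializable';

// Hypothetical two-field index over {id, title, content} documents.
const index = createIndex(2);
const fieldAccessors = [doc => doc.title, doc => doc.content];
const fieldBoosts = [1, 1];
const tokenizer = s => s.split(/\s+/g);
const filter = term => term.toLowerCase();

function add(doc) {
  addDocumentToIndex(index, fieldAccessors, tokenizer, filter, doc.id, doc);
}

function search(q) {
  // BM25 ranking; 1.2 and 0.75 are the conventional k1 and b defaults.
  return query(index, fieldBoosts, 1.2, 0.75, tokenizer, filter, undefined, q);
}

add({id: 1, title: 'Example Domain', content: 'some archived page text'});
console.log(search('archived')); // [{key: 1, score: ...}]

// ndx-serializable round-trips the index through plain JSON.
const restored = fromSerializable(JSON.parse(JSON.stringify(toSerializable(index))));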
@@ -41,6 +42,7 @@

 // search related state: constants and variables
 // common
+const Fuzzy = globalThis.FuzzySearch;
 const NDX_OLD = false;
 const USE_FLEX = true;
 const FTS_INDEX_DIR = args.fts_index_dir;
@@ -50,10 +52,6 @@
   NDX_ID_KEY
 ]);
 const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key);
-const nextOffset = (query, doc, startAt = 0) => Nat.LevenshteinDistanceSearch(
-  query,
-  doc.slice(startAt)
-);
 let Id;

 // natural (NLP tools -- stemmers and tokenizers, etc)
@ -441,6 +439,7 @@ export default Archivist;
|
||||||
doc.contentSignature = contentSignature;
|
doc.contentSignature = contentSignature;
|
||||||
fuzzy.add(doc);
|
fuzzy.add(doc);
|
||||||
State.Docs.set(url, doc);
|
State.Docs.set(url, doc);
|
||||||
|
console.log(doc,url);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUG && console.log("NDX updated", doc.ndx_id);
|
DEBUG && console.log("NDX updated", doc.ndx_id);
|
||||||
|
@ -660,6 +659,7 @@ export default Archivist;
|
||||||
const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
|
const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
|
||||||
State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
|
State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
|
||||||
doc.contentSignature = getContentSig(doc);
|
doc.contentSignature = getContentSig(doc);
|
||||||
|
console.log(doc.url, doc);
|
||||||
return [doc.url, doc];
|
return [doc.url, doc];
|
||||||
}));
|
}));
|
||||||
await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc)));
|
await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc)));
|
||||||
|
@ -676,7 +676,7 @@ export default Archivist;
|
||||||
|
|
||||||
function saveFuzzy(basePath) {
|
function saveFuzzy(basePath) {
|
||||||
const docs = [...State.Docs.values()]
|
const docs = [...State.Docs.values()]
|
||||||
.map(({url, title, content, id}) => ({url, title, content, id}));
|
.map(({i_url, url, title, content, id}) => ({i_url, url, title, content, id}));
|
||||||
const path = getFuzzyPath(basePath);
|
const path = getFuzzyPath(basePath);
|
||||||
Fs.writeFileSync(
|
Fs.writeFileSync(
|
||||||
path,
|
path,
|
||||||
|
@ -862,18 +862,12 @@ export default Archivist;
|
||||||
function findOffsets(query, doc, count) {
|
function findOffsets(query, doc, count) {
|
||||||
let res = [];
|
let res = [];
|
||||||
|
|
||||||
let i = 0;
|
const result = Nat.LevenshteinDistanceSearch(query, doc);
|
||||||
while(i < doc.length) {
|
|
||||||
const result = nextOffset(query, doc, i);
|
if ( result.distance/result.substring.length < 0.5 ) {
|
||||||
console.log(result, i);
|
|
||||||
i += result.offset + result.substring.length + SNIP_CONTEXT;
|
|
||||||
res.push(result);
|
res.push(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
res.sort(({distance:a}, {distance:b}) => a-b);
|
|
||||||
console.log({res});
|
|
||||||
res = res.slice(0, count);
|
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
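The rewritten findOffsets leans on natural's LevenshteinDistanceSearch, which scans a document for the substring with the smallest edit distance to the query and reports where it starts. A hedged sketch of the call shape; the sample strings are invented, and the {substring, distance, offset} result shape follows natural's documentation:

import Nat from 'natural';

// Scans the haystack for the substring with the smallest edit distance
// to the needle and reports where it starts.
const result = Nat.LevenshteinDistanceSearch('archivist', 'the archvist indexes pages');
// result ~= { substring: 'archvist', distance: 1, offset: 4 }

// The 0.5 threshold in the hunk above keeps a hit only when fewer than
// half the characters of the matched substring needed editing.
if ( result.distance / result.substring.length < 0.5 ) {
  console.log('acceptable match at offset', result.offset);
}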
@@ -939,7 +933,15 @@ export default Archivist;

     const results = combineResults({flex, ndx, fuzz});

-    return {query,results};
+    const highlights = fuzzRaw.map(obj => ({
+      id: obj.id,
+      url: fuzzy.highlight(obj.url),
+      title: fuzzy.highlight(State.Index.get(obj.id).title),
+    }));
+    const HL = new Map();
+    highlights.forEach(hl => HL.set(hl.id, hl));
+
+    return {query,results, HL};
   }

   function combineResults({flex,ndx,fuzz}) {
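The new highlights block calls fuzzy.highlight(...) on the raw fuzzy-search results. A rough sketch of that flow using the fz-search package directly; the constructor options and the default <strong> markup are recalled from fz-search's README, and the documents here are stand-ins, so treat all of it as assumed rather than this repo's exact wiring (which goes through globalThis.FuzzySearch):

import FuzzySearch from 'fz-search';

// Stand-in documents for State.Docs.
const docs = [
  {id: 1, url: 'https://example.com', title: 'Example Domain'},
];

const fuzzy = new FuzzySearch({source: docs, keys: ['url', 'title']});

const fuzzRaw = fuzzy.search('exmple');
// highlight() marks up the matched runs of the most recent query
// (by default with <strong> tags), which is what the HL map carries
// through to SearchResultView.
const highlights = fuzzRaw.map(obj => ({
  id: obj.id,
  url: fuzzy.highlight(obj.url),
  title: fuzzy.highlight(obj.title),
}));
console.log(highlights);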
@ -55,7 +55,7 @@ function addHandlers() {
|
||||||
|
|
||||||
app.get('/search(.json)?', async (req, res) => {
|
app.get('/search(.json)?', async (req, res) => {
|
||||||
await Archivist.isReady();
|
await Archivist.isReady();
|
||||||
const {query, results:resultIds} = await Archivist.search(req.query.query);
|
const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
|
||||||
const results = resultIds.map(docId => Archivist.getDetails(docId));
|
const results = resultIds.map(docId => Archivist.getDetails(docId));
|
||||||
if ( req.path.endsWith('.json') ) {
|
if ( req.path.endsWith('.json') ) {
|
||||||
res.end(JSON.stringify({
|
res.end(JSON.stringify({
|
||||||
|
@ -63,17 +63,9 @@ function addHandlers() {
|
||||||
}, null, 2));
|
}, null, 2));
|
||||||
} else {
|
} else {
|
||||||
results.forEach(r => {
|
results.forEach(r => {
|
||||||
const Offsets = Archivist.findOffsets(query, r.content, 3);
|
r.snippet = ['no snippet']
|
||||||
|
|
||||||
r.snippet = [];
|
|
||||||
for ( const {substring,offset} of Offsets ) {
|
|
||||||
r.snippet.push(r.content.substring(offset-SNIP_CONTEXT, offset) +
|
|
||||||
`<strong>${substring}</strong>` +
|
|
||||||
r.content.substr(offset+substring.length, SNIP_CONTEXT)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
res.end(SearchResultView({results, query}));
|
res.end(SearchResultView({results, query, HL}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
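The route now stubs snippets out with r.snippet = ['no snippet'], hence "need to improve snippets" in the commit message. One way to rebuild them from findOffsets-style {substring, offset} results, adapted from the deleted lines above; the SNIP_CONTEXT value and the helper name are assumptions:

// Hypothetical snippet builder over findOffsets-style results.
const SNIP_CONTEXT = 40; // chars of context on each side (assumed value)

function toSnippets(content, offsets) {
  return offsets.map(({substring, offset}) =>
    content.slice(Math.max(0, offset - SNIP_CONTEXT), offset) +
    `<strong>${substring}</strong>` +
    content.slice(offset + substring.length, offset + substring.length + SNIP_CONTEXT)
  );
}

// e.g. r.snippet = toSnippets(r.content, Archivist.findOffsets(query, r.content, 3));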
@ -194,7 +186,7 @@ function IndexView(urls) {
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
function SearchResultView({results, query}) {
|
function SearchResultView({results, query, HL}) {
|
||||||
return `
|
return `
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<meta charset=utf-8>
|
<meta charset=utf-8>
|
||||||
|
@ -249,7 +241,9 @@ function SearchResultView({results, query}) {
|
||||||
${
|
${
|
||||||
results.map(({snippet, url,title,id}) => `
|
results.map(({snippet, url,title,id}) => `
|
||||||
<li>
|
<li>
|
||||||
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${title||url}</a>
|
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
|
||||||
|
<br>
|
||||||
|
<small>${(HL.get(id)?.url||url).slice(0,128)}</small>
|
||||||
<p>${snippet.join('…')}</p>
|
<p>${snippet.join('…')}</p>
|
||||||
</li>
|
</li>
|
||||||
`).join('\n')
|
`).join('\n')
|
||||||
|
|
todo (1 line changed)
@@ -1,3 +1,4 @@
+- fix snippets
 - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
 - Snippets with highlights via levenshtein distance search from natural
 - Improve search page look
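On the first todo item (an FTS error corrupting the index beyond recovery): a common guard is to write the serialized index to a temporary file and rename it over the live one, so a crash mid-save never leaves a half-written file. A sketch assuming Node's fs and an ndx index serialized with toSerializable; the function and file names are made up:

import Fs from 'fs';
import { toSerializable } from 'ndx-serializable';

// Write to a temp file, then rename over the live one: rename() is atomic
// on the same filesystem, so a crash mid-save leaves the old index intact
// instead of a truncated, unrecoverable file.
function saveIndexSafely(index, path) {
  const tmp = path + '.tmp';
  Fs.writeFileSync(tmp, JSON.stringify(toSerializable(index)));
  Fs.renameSync(tmp, path);
}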