"Fixed an issue with fuzzy and i_url. need to improve snippets"
parent c8517f6b0f
commit a050a1a43d
archivist.js (36 lines changed)
@@ -21,11 +21,12 @@
   addDocumentToIndex as ndx,
   removeDocumentFromIndex,
   vacuumIndex
-} from './lib/ndx.js';
+} from 'ndx';
 import { query as NDXQuery } from 'ndx-query';
 import { toSerializable, fromSerializable } from 'ndx-serializable';
 //import { DocumentIndex } from 'ndx';
-import Fuzzy from 'fz-search';
+//import Fuzzy from 'fz-search';
+import * as _Fuzzy from './lib/fz.js';
 import Nat from 'natural';
 
 import args from './args.js';
@@ -41,6 +42,7 @@
 
 // search related state: constants and variables
 // common
+const Fuzzy = globalThis.FuzzySearch;
 const NDX_OLD = false;
 const USE_FLEX = true;
 const FTS_INDEX_DIR = args.fts_index_dir;
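Note on the import change above: the npm fz-search import is commented out in favor of a vendored build at ./lib/fz.js, and Fuzzy is then read back off globalThis.FuzzySearch. That only works if the vendored file registers FuzzySearch on globalThis as a side effect of being imported. A minimal sketch of how the searcher is presumably constructed and used, with the option names following fz-search's documented constructor and the sample document purely illustrative:

import * as _Fuzzy from './lib/fz.js'; // assumed to set globalThis.FuzzySearch

const Fuzzy = globalThis.FuzzySearch;

// Score documents by the listed keys; the source can start empty and be
// filled later with fuzzy.add(doc), as this commit does on index load.
const fuzzy = new Fuzzy({
  source: [],
  keys: ['title', 'content'],
});

fuzzy.add({id: 1, url: 'https://example.com', title: 'Example', content: 'example text'});
const hits = fuzzy.search('exmple'); // tolerant of missing or swapped letters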
@@ -50,10 +52,6 @@
   NDX_ID_KEY
 ]);
 const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key);
-const nextOffset = (query, doc, startAt = 0) => Nat.LevenshteinDistanceSearch(
-  query,
-  doc.slice(startAt)
-);
 let Id;
 
 // natural (NLP tools -- stemmers and tokenizers, etc)
@@ -441,6 +439,7 @@ export default Archivist;
       doc.contentSignature = contentSignature;
       fuzzy.add(doc);
       State.Docs.set(url, doc);
+      console.log(doc,url);
     }
 
     DEBUG && console.log("NDX updated", doc.ndx_id);
@@ -660,6 +659,7 @@ export default Archivist;
     const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
     State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
       doc.contentSignature = getContentSig(doc);
+      console.log(doc.url, doc);
       return [doc.url, doc];
     }));
     await Promise.all([...State.Docs.values()].map(async doc => fuzzy.add(doc)));
@@ -676,7 +676,7 @@ export default Archivist;
 
   function saveFuzzy(basePath) {
     const docs = [...State.Docs.values()]
-      .map(({url, title, content, id}) => ({url, title, content, id}));
+      .map(({i_url, url, title, content, id}) => ({i_url, url, title, content, id}));
     const path = getFuzzyPath(basePath);
     Fs.writeFileSync(
       path,
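This .map change is the i_url fix named in the commit message: saveFuzzy serializes State.Docs to disk, and the old destructuring silently dropped the i_url field, so documents restored on the next load lost it. A sketch of the round trip under that assumption, with a hypothetical document:

const doc = {i_url: 'https://example.com/x', url: 'https://example.com/x', title: 'X', content: '...', id: 1};

// old: ({url, title, content, id}) => ... dropped i_url on save
// new: i_url survives the save/load cycle
const saved = JSON.stringify([doc].map(
  ({i_url, url, title, content, id}) => ({i_url, url, title, content, id})
));
const restored = JSON.parse(saved);
console.log(restored[0].i_url); // 'https://example.com/x'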
@@ -862,18 +862,12 @@ export default Archivist;
   function findOffsets(query, doc, count) {
     let res = [];
 
-    let i = 0;
-    while(i < doc.length) {
-      const result = nextOffset(query, doc, i);
-      console.log(result, i);
-      i += result.offset + result.substring.length + SNIP_CONTEXT;
-    }
+    const result = Nat.LevenshteinDistanceSearch(query, doc);
 
     if ( result.distance/result.substring.length < 0.5 ) {
       res.push(result);
     }
 
     res.sort(({distance:a}, {distance:b}) => a-b);
+    console.log({res});
     res = res.slice(0, count);
 
     return res;
   }
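For context on the findOffsets rewrite above: natural's LevenshteinDistanceSearch(needle, haystack) finds the substring of the haystack with the smallest edit distance to the needle and returns an object shaped like { substring, distance, offset }. The new code accepts a hit only when distance divided by substring length is below 0.5, a rough 50% similarity cutoff, but it now probes the document once instead of walking it, which is part of why snippets still "need to improve". A small standalone sketch of the check, with illustrative inputs:

import Nat from 'natural';

const query = 'archiv';
const doc = 'The archivist indexes every page you visit.';

// Best fuzzy occurrence of query inside doc: { substring, distance, offset }
const result = Nat.LevenshteinDistanceSearch(query, doc);

// Keep the hit only if fewer than half of its characters needed edits.
if (result.distance / result.substring.length < 0.5) {
  console.log('match at offset', result.offset, ':', result.substring);
}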
@@ -939,7 +933,15 @@ export default Archivist;
 
     const results = combineResults({flex, ndx, fuzz});
 
-    return {query,results};
+    const highlights = fuzzRaw.map(obj => ({
+      id: obj.id,
+      url: fuzzy.highlight(obj.url),
+      title: fuzzy.highlight(State.Index.get(obj.id).title),
+    }));
+    const HL = new Map();
+    highlights.forEach(hl => HL.set(hl.id, hl));
+
+    return {query,results, HL};
   }
 
   function combineResults({flex,ndx,fuzz}) {
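The HL map built above keys fz-search's highlighted strings by document id so the HTML view can look them up directly; fuzzy.highlight(text) returns the text with the query's matched terms wrapped in highlight markup. The same map can be built in one step with the Map(iterable) constructor. A sketch, assuming fuzzRaw, fuzzy, and State.Index as they appear in this diff:

// Equivalent construction of HL in a single expression.
const HL = new Map(fuzzRaw.map(obj => [obj.id, {
  id: obj.id,
  url: fuzzy.highlight(obj.url),
  title: fuzzy.highlight(State.Index.get(obj.id).title),
}]));

// The view then falls back gracefully for docs outside the fuzzy results:
// HL.get(id)?.title || title || url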
@@ -55,7 +55,7 @@ function addHandlers() {
 
   app.get('/search(.json)?', async (req, res) => {
     await Archivist.isReady();
-    const {query, results:resultIds} = await Archivist.search(req.query.query);
+    const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
     const results = resultIds.map(docId => Archivist.getDetails(docId));
     if ( req.path.endsWith('.json') ) {
       res.end(JSON.stringify({
@@ -63,17 +63,9 @@ function addHandlers() {
       }, null, 2));
     } else {
       results.forEach(r => {
-        const Offsets = Archivist.findOffsets(query, r.content, 3);
-
-        r.snippet = [];
-        for ( const {substring,offset} of Offsets ) {
-          r.snippet.push(r.content.substring(offset-SNIP_CONTEXT, offset) +
-            `<strong>${substring}</strong>` +
-            r.content.substr(offset+substring.length, SNIP_CONTEXT)
-          );
-        }
+        r.snippet = ['no snippet']
       });
-      res.end(SearchResultView({results, query}));
+      res.end(SearchResultView({results, query, HL}));
     }
   });
 
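The handler above rips out the offset-based snippet builder and stubs every result with 'no snippet', which matches the commit message's note that snippets still need work. For reference, the deleted lines windowed SNIP_CONTEXT characters of context around each match. A compact sketch of that idea, where SNIP_CONTEXT's value is assumed for illustration (the real constant lives elsewhere) and each offsets entry has the { substring, offset } shape produced by findOffsets:

const SNIP_CONTEXT = 40; // assumed width, for illustration only

function makeSnippets(content, offsets) {
  // substring() clamps a negative start to 0, so matches near the
  // beginning of the document are safe.
  return offsets.map(({substring, offset}) =>
    content.substring(offset - SNIP_CONTEXT, offset) +
    `<strong>${substring}</strong>` +
    content.substr(offset + substring.length, SNIP_CONTEXT)
  );
}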
@@ -194,7 +186,7 @@ function IndexView(urls) {
   `
 }
 
-function SearchResultView({results, query}) {
+function SearchResultView({results, query, HL}) {
   return `
     <!DOCTYPE html>
     <meta charset=utf-8>
@@ -249,7 +241,9 @@ function SearchResultView({results, query}) {
         ${
           results.map(({snippet, url,title,id}) => `
             <li>
-              ${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${title||url}</a>
+              ${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
+              <br>
+              <small>${(HL.get(id)?.url||url).slice(0,128)}</small>
               <p>${snippet.join('…')}</p>
             </li>
           `).join('\n')