"Snippets"

This commit is contained in:
Cris Stringfellow 2021-12-23 11:01:20 +08:00
parent d1c8cef7c9
commit 37013ed080
5 changed files with 135 additions and 28 deletions

View File

@ -28,6 +28,8 @@
//import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural';
//import match from 'autosuggest-highlight/match';
//import parse from 'autosuggest-highlight/parse';
import args from './args.js';
import {
@ -859,22 +861,10 @@ export default Archivist;
return {url, title, id, content};
}
function findOffsets(query, doc, count) {
// this is the slow part
let res = [];
const result = Nat.LevenshteinDistanceSearch(query, doc);
if ( result.distance/result.substring.length < 0.5 ) {
const {substring,offset} = result;
res.push(
doc.substring(offset-50, offset) +
`<strong>${substring}</strong>` +
doc.substr(substring.length + offset, 50)
);
}
return res;
function findOffsets(query, doc, count = 0) {
const hl = fuzzy.highlight(doc);
DEBUG && console.log(hl);
return hl;
}
function beforePathChanged() {
@ -944,8 +934,8 @@ export default Archivist;
const title = State.Index.get(obj.url)?.title;
return {
id: obj.id,
url: Archivist.findOffsets(query, obj.url)[0] || obj.url,
title: Archivist.findOffsets(query, title)[0] || title,
url: Archivist.findOffsets(query, obj.url) || obj.url,
title: Archivist.findOffsets(query, title) || title,
};
});
highlights.forEach(hl => HL.set(hl.id, hl));
@ -961,9 +951,14 @@ export default Archivist;
fuzz.forEach(countRank(score));
const results = [...Object.values(score)].map(obj => {
const {id} = State.Index.get(obj.url);
obj.id = id;
return obj;
try {
const {id} = State.Index.get(obj.url);
obj.id = id;
return obj;
} catch(e) {
console.log(obj, State.Index, e);
throw e;
}
});
results.sort(({score:scoreA}, {score:scoreB}) => scoreA-scoreB);
const resultIds = results.map(({id}) => id);

109
lib/fuzzy.js Normal file
View File

@ -0,0 +1,109 @@
/**
* Modified Dec 23 2021 by Cris Stringfellow
* fuzzy.js v0.1.0
* (c) 2016 Ben Ripkens
* @license: MIT
*/
// NOTES
/*
* Whether or not fuzzy.js should analyze sub-terms, i.e. also
* check term starting positions != 0.
*
* Example:
* Given the term 'Halleluja' and query 'luja'
*
* Fuzzy.js scores this combination with a 14 (4 matched characters plus
* two consecutive-match bonuses of 5 each), when analyzeSubTerms is
* set to false, as the following matching string will be calculated:
* Ha[l]lel[uja]
*
* If you activate sub-term analysis though, the query will reach a score
* of 19 (4 matched characters plus three consecutive-match bonuses of 5
* each), as the matching string looks as follows:
* Halle[luja]
*
* Naturally, the second version is more expensive than the first one.
* You should therefore configure how many sub terms you wish to analyse.
* This can be configured through fuzzy.analyzeSubTermDepth = 10.
*/
// Default settings, stored as properties on the `fuzzy` function itself.
Object.assign(fuzzy, {
  // Sub-term analysis (see the notes above) is switched off by default.
  analyzeSubTerms: false,

  // How many sub terms should be analyzed.
  analyzeSubTermDepth: 10,

  // Markup emitted before / after every matched character of a term.
  highlighting: {
    before: '<em>',
    after: '</em>',
  },
});
/**
 * Comparator for two match objects (each with a numeric `score` and a string
 * `term`). Returns the score difference `m2.score - m1.score` when the scores
 * differ; otherwise returns `m1.term.length - m2.term.length`. Used with
 * `Array.prototype.sort`, this puts higher scores first and, among equal
 * scores, shorter terms first.
 *
 * @param {{score: number, term: string}} m1
 * @param {{score: number, term: string}} m2
 * @returns {number}
 */
fuzzy.matchComparator = function matchComparator(m1, m2) {
  const scoreDelta = m2.score - m1.score;
  if (scoreDelta !== 0) {
    return scoreDelta;
  }
  return m1.term.length - m2.term.length;
};
/**
 * Score `term` against `query` and return the best result found.
 *
 * The full term is always scored. When `fuzzy.analyzeSubTerms` is truthy, the
 * suffixes of `term` that start at offsets 1 up to (but excluding)
 * `fuzzy.analyzeSubTermDepth` are scored too; a suffix replaces the current
 * best only when its score is strictly higher.
 *
 * @param {string} term - Text to search in.
 * @param {string} query - Characters to look for.
 * @returns {object} The result object produced by `calcFuzzyScore` for the
 *   winning candidate, with `term` and `highlightedTerm` referring to the
 *   complete term.
 */
export default function fuzzy(term, query) {
  let best = calcFuzzyScore(term, query);
  if (!fuzzy.analyzeSubTerms) {
    return best;
  }

  const offsetLimit = Math.min(term.length, fuzzy.analyzeSubTermDepth);
  for (let offset = 1; offset < offsetLimit; offset += 1) {
    const candidate = calcFuzzyScore(term.substring(offset), query);
    if (candidate.score <= best.score) {
      continue;
    }
    // calcFuzzyScore does not know that it operated on a substring, so put
    // the full term back and re-attach the skipped prefix to the highlighted
    // text. Only done for a new maximum, to save some work.
    candidate.term = term;
    candidate.highlightedTerm = term.substring(0, offset) + candidate.highlightedTerm;
    best = candidate;
  }
  return best;
}
/**
 * Greedy, case-insensitive, in-order match of `query` against `term`.
 *
 * Each query character is matched against the first not-yet-consumed term
 * character that equals it ignoring case; matching stops as soon as either
 * string is exhausted. Every match scores 1 point, plus 5 bonus points when
 * it directly follows the previous match. Matched characters are wrapped in
 * `fuzzy.highlighting.before` / `fuzzy.highlighting.after`; all other term
 * characters are copied through unchanged.
 *
 * Both strings are walked by Unicode code point rather than by UTF-16 code
 * unit, so a character outside the Basic Multilingual Plane (an emoji, for
 * instance) is matched and highlighted as one unit instead of being split
 * into two lone surrogates with markup in between.
 *
 * @param {string} term - Text to search in.
 * @param {string} query - Characters to look for, in order.
 * @returns {{score: number, term: string, query: string, highlightedTerm: string}}
 */
function calcFuzzyScore(term, query) {
  const termChars = [...term];
  const queryChars = [...query];
  const termLength = termChars.length;
  const queryLength = queryChars.length;

  let score = 0;
  let highlighting = '';
  let ti = 0;
  // Index of the most recently matched term character. It must start at -2:
  // with -1 a first match at index 0 would satisfy `previous + 1 === ti` and
  // wrongly earn the consecutive-match bonus.
  let previousMatchingCharacter = -2;

  for (let qi = 0; qi < queryLength && ti < termLength; qi++) {
    const lowerQc = queryChars[qi].toLowerCase();

    for (; ti < termLength; ti++) {
      const tc = termChars[ti];

      if (lowerQc !== tc.toLowerCase()) {
        highlighting += tc;
        continue;
      }

      score++;
      if (previousMatchingCharacter + 1 === ti) {
        score += 5;
      }
      highlighting += fuzzy.highlighting.before + tc + fuzzy.highlighting.after;
      previousMatchingCharacter = ti;
      // Consume the matched character before moving to the next query character.
      ti++;
      break;
    }
  }

  // Whatever is left of the term after the last match is copied verbatim.
  highlighting += termChars.slice(ti).join('');

  return {
    score,
    term,
    query,
    highlightedTerm: highlighting,
  };
}

6
lib/testFuzzy.js Normal file
View File

@ -0,0 +1,6 @@
import fuzzy from './fuzzy.js';
// Manual smoke test: print the imported function, then the match result for a
// sample headline searched for 'Queen'.
console.log(fuzzy);

const headline = 'Meghan Markle requested this unexpected Christmas present for Archie from the Queen';
const match = fuzzy(headline, 'Queen');
console.log(match);

View File

@ -63,12 +63,7 @@ function addHandlers() {
}, null, 2));
} else {
results.forEach(r => {
const m = Archivist.findOffsets(query, r.content);
if ( m.length ) {
r.snippet = m;
} else {
r.snippet = [r.content.slice(0, 150)];
}
r.snippet = Archivist.findOffsets(query, r.content.slice(0,150));
});
res.end(SearchResultView({results, query, HL}));
}
@ -249,7 +244,7 @@ function SearchResultView({results, query, HL}) {
${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
<br>
<small>${(HL.get(id)?.url||url).slice(0,128)}</small>
<p>${snippet.join('&hellip;')}</p>
<p>${snippet}</p>
</li>
`).join('\n')
}

2
todo
View File

@ -1,3 +1,5 @@
- use ukkonen to find snippets
- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
- Improve search page look