From a5f00b0dc023c12aeb60e385026c41f38cb82f50 Mon Sep 17 00:00:00 2001 From: Cris Stringfellow <22254235+crislin2046@users.noreply.github.com> Date: Sat, 25 Dec 2021 14:13:41 +0800 Subject: [PATCH] "Search and snippets are OK, let's improve the look." --- highlighter.js | 2 +- todo | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/highlighter.js b/highlighter.js index 7360876..a886e5b 100644 --- a/highlighter.js +++ b/highlighter.js @@ -4,7 +4,7 @@ import {DEBUG} from './common.js'; const MAX_ACCEPT_SCORE = 0.5; const CHUNK_SIZE = 12; -testHighlighter(); +//testHighlighter(); function params(qLength, chunkSize) { const MaxDist = CHUNK_SIZE; diff --git a/todo b/todo index 0db3ee5..5df7cc5 100644 --- a/todo +++ b/todo @@ -1,4 +1,6 @@ - get snippets earlier (before rendering in lib server) and use to add to signal +- if we have multiple query terms (multiple determined by some form of tokenization) then try to show all terms present in the snippet, even though one term may be higher scoring. Should we do multiple passes of Ukkonen distance, one for the whole query and one for each term? This will be easier / faster with trigrams I guess. Basically we want the snippet to be a relevant summary that provides signal. +- Another way to improve snippet highlighting is to 'revert back' the highlighted text, and calculate its match/Ukkonen score against the query term. So e.g. if we get q:'israle beverly', hl:['beverly', 'beverly'], it's good overlap, but if we get hl:['is it really'], even though that might score ok for israle, it's not a good match. So can we 'score that back', i.e. run match('is it really', 'israel'), see that it is low, and exclude it? - implement trigram index - try an exact match on the query term if possible for highlight. first one. - we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)