Review notes (text before the first "diff --git" header is ignored by git apply / patch):
- This patch had been collapsed onto a single line; line breaks are restored so that each
  hunk matches the counts in its @@ header. Indentation of context lines is reconstructed,
  so apply with `git apply --ignore-whitespace`.
- archivist.js: /\t\n/g only matches a tab immediately followed by a newline. The character
  class /[\t\n]/g replaces every tab and every newline with a space.
- highlighter.js: the top-level testHighlighter() call is gated on DEBUG so that importing
  the module does not run the self-test unconditionally.
- The post-image blob ids on the "index" lines predate the two fixes above.

diff --git a/archivist.js b/archivist.js
index 465a7e0..61a20dc 100644
--- a/archivist.js
+++ b/archivist.js
@@ -413,7 +413,7 @@ export default Archivist;
       const flatDoc = await send("DOMSnapshot.captureSnapshot", {
         computedStyles: [],
       }, sessionId);
-      const pageText = processDoc(flatDoc);
+      const pageText = processDoc(flatDoc).replace(/[\t\n]/g, ' ');
 
       const {title, url} = Targets.get(sessionId);
       let id, ndx_id;
diff --git a/highlighter.js b/highlighter.js
index 86c6c50..7360876 100644
--- a/highlighter.js
+++ b/highlighter.js
@@ -2,9 +2,9 @@ import ukkonen from 'ukkonen';
 import {DEBUG} from './common.js';
 
 const MAX_ACCEPT_SCORE = 0.5;
-const CHUNK_SIZE = 24;
+const CHUNK_SIZE = 12;
 
-//testHighlighter();
+DEBUG && testHighlighter();
 
 function params(qLength, chunkSize) {
   const MaxDist = CHUNK_SIZE;
@@ -216,5 +216,5 @@ function testHighlighter() {
 46 points by helsinkiandrew 8 hours ago | hide | 17 comments
 More
   `
-  ), null, 2));
+  ).map(({fragment:{text,offset}}) => offset + ':' + text), null, 2));
 }
diff --git a/libraryServer.js b/libraryServer.js
index 47e901d..a3e0b04 100644
--- a/libraryServer.js
+++ b/libraryServer.js
@@ -65,6 +65,7 @@ function addHandlers() {
       } else {
         results.forEach(r => {
           r.snippet = highlight(query, r.content, {maxLength:MAX_HIGHLIGHTABLE_LENGTH})
+            .sort(({fragment:{offset:a}}, {fragment:{offset:b}}) => a-b)
             .map(hl => Archivist.findOffsets(query, hl.fragment.text))
             .join(' ... ');
         });
diff --git a/todo b/todo
index dfbba04..0db3ee5 100644
--- a/todo
+++ b/todo
@@ -1,3 +1,4 @@
+- get snippets earlier (before rendering in lib server) and use to add to signal
 - implement trigram index
 - try an exact match on the query term if possible for highlight. first one.
 - we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)