"Snippets getting better. Scoring still has some errors: 0 score for unrelated fragment? how ... discover this"

2021-12-24 11:40:57 +08:00 · 2021-12-24 11:40:57 +08:00 · 0d37fe503b
parent 197ff25631
commit 0d37fe503b
3 changed files with 51 additions and 25 deletions
--- a/archivist.js
+++ b/archivist.js
@ -278,7 +278,7 @@ export default Archivist;
      const {sessionId} = install;
      if ( ! ConfirmedInstalls.has(sessionId) ) {
        ConfirmedInstalls.add(sessionId);
-        console.log({confirmedInstall:val, context});
+        DEBUG && console.log({confirmedInstall:val, context});
      }
    }

@ -293,8 +293,9 @@ export default Archivist;
    }

    function displayTargetInfo({targetInfo}) {
+      const DEBUG = true;
      if ( targetInfo.type === 'page' ) {
-        console.log("Target info", JSON.stringify(targetInfo, null, 2));
+        DEBUG && console.log("Target info", JSON.stringify(targetInfo, null, 2));
      }
    }

@ -385,7 +386,7 @@ export default Archivist;
      if ( info.url.startsWith('chrome') ) return;
      if ( dontCache(info) ) return;

-      console.log('Index URL called', info);
+      DEBUG && console.log('Index URL called', info);

      if ( State.Indexing.has(info.targetId) ) return;
      State.Indexing.add(info.targetId);
@ -441,14 +442,14 @@ export default Archivist;
        doc.contentSignature = contentSignature;
        fuzzy.add(doc);
        State.Docs.set(url, doc);
-        console.log(doc,url);
+        DEBUG && console.log({updateFuzz: {doc,url}});
      }

      DEBUG && console.log("NDX updated", doc.ndx_id);

      UpdatedKeys.add(url);

-      console.log({id: doc.id, title, url, indexed: true});
+      DEBUG && console.log({id: doc.id, title, url, indexed: true});

      State.Indexing.delete(info.targetId);
    }
@ -657,7 +658,6 @@ export default Archivist;
  }

  async function loadFuzzy() {
-    const DEBUG = true;
    const fuzzyDocs = Fs.readFileSync(getFuzzyPath()).toString();
    State.Docs = new Map(JSON.parse(fuzzyDocs).map(doc => {
      doc.i_url = getURI(doc.url);
@ -684,7 +684,7 @@ export default Archivist;
      path,
      JSON.stringify(docs)
    );
-    console.log(`Wrote fuzzy to ${path}`);
+    DEBUG && console.log(`Wrote fuzzy to ${path}`);
  }

  function clearSavers() {
@ -725,7 +725,6 @@ export default Archivist;
    }

    try {
-      const DEBUG = true;
      const flexBase = getFlexBase();
      Fs.readdirSync(flexBase, {withFileTypes:true}).forEach(dirEnt => {
        if ( dirEnt.isFile() ) {
@ -862,7 +861,6 @@ export default Archivist;
  }

  function findOffsets(query, doc, count = 0) {
-    const DEBUG = true;
    const hl = fuzzy.highlight(doc); 
    DEBUG && console.log(query, hl);
    return hl;
@ -948,7 +946,6 @@ export default Archivist;
  }

  function combineResults({flex,ndx,fuzz}) {
-    const DEBUG = true;
    DEBUG && console.log({flex,ndx,fuzz});
    const score = {};
    flex.forEach(countRank(score));
@ -1014,7 +1011,7 @@ export default Archivist;
            console.error('Error writing full text search index', e);
          }
        });
-        console.log(`Wrote Flex to ${flexBase}`);
+        DEBUG && console.log(`Wrote Flex to ${flexBase}`);
        NDX_FTSIndex.save(dir);
        saveFuzzy(dir);
        UpdatedKeys.clear();
@ -1114,7 +1111,7 @@ export default Archivist;
            path,
            objStr
          );
-          console.log("Write NDX to ", path);
+          DEBUG && console.log("Write NDX to ", path);
        },
        load: newIndex => {
          retVal.index = newIndex;
@ -1143,7 +1140,6 @@ export default Archivist;
  }

  function loadNDXIndex(ndxFTSIndex) {
-    const DEBUG = true;
    if ( Fs.existsSync(getNDXPath()) ) {
      const indexContent = Fs.readFileSync(getNDXPath()).toString();
      const index = fromSerializable(JSON.parse(indexContent));
--- a/highlighter.js
+++ b/highlighter.js
@ -5,21 +5,29 @@ const CHUNK_SIZE = 24;

 testHighlighter();

-export function highlight(query, doc, {
-  maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE
-} = {}) {
+function params(qLength, chunkSize) {
  const MaxDist = CHUNK_SIZE;
+  const MinScore = Math.abs(qLength - CHUNK_SIZE);
+  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
+  return {MaxDist,MinScore,MaxScore};
+}
+
+export function highlight(query, doc, {
+  maxAcceptScore: maxAcceptScore = MAX_ACCEPT_SCORE,
+  chunkSize: chunkSize = CHUNK_SIZE
+} = {}) {
+  doc = Array.from(doc);
  const highlights = [];
  // use array from then length rather than string length to 
  // give accurate length for all unicode
  const qLength = Array.from(query).length;
-  const MinScore = Math.abs(qLength - CHUNK_SIZE);
-  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
-
-  const fragments = Array.from(doc).reduce(getFragmenter(CHUNK_SIZE), []);
+  const {MaxDist,MinScore,MaxScore} = params(qLength, chunkSize);
+  const fragments = doc.reduce(getFragmenter(chunkSize), []);
+  query.toLocaleLowerCase();
+  console.log(fragments);

  const scores = fragments.map(fragment => {
-    const distance = ukkonen(query, fragment, MaxDist);
+    const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);
    // the min score possible = the minimum number of edits between 
    const scaledScore = (distance - MinScore)/MaxScore;
    return {score: scaledScore, fragment};
@ -27,7 +35,6 @@ export function highlight(query, doc, {

  // sort ascending (smallest scores win)
  scores.sort(({score:a}, {score:b}) => a-b);
-  console.log({scores});

  for( const {score, fragment} of scores ) {
    if ( score > maxAcceptScore ) {
@ -38,6 +45,26 @@ export function highlight(query, doc, {

  if ( highlights.length === 0 ) {
    console.log('Zero highlights, showing first score', scores[0]);
+    return scores.slice(0,1);
+  } else {
+    let better = JSON.parse(JSON.stringify(highlights)).slice(0, 10);
+    better = better.map(hl => {
+      const length = Array.from(hl.fragment.text).length;
+      const extra = Math.round(length/2);
+      let {offset} = hl.fragment;
+      const newText = doc.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + doc.slice(offset + length, offset + length + extra).join('');
+      //console.log({newText, oldText:hl.fragment.text});
+      hl.fragment.text = newText;
+      const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
+      const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);
+      // the min score possible = the minimum number of edits between 
+      const scaledScore = (distance - MinScore)/MaxScore;
+      hl.score = scaledScore;
+      return hl;
+    });
+    better.sort(({score:a}, {score:b}) => a-b);
+    console.log(JSON.stringify({better},null,2));
+    return better.slice(0,3);
  }

  return highlights;
@ -60,9 +87,9 @@ function getFragmenter(chunkSize) {
      // keep adding to the currentFragment
    if ( frags.length && ((currentLength + 1) <= chunkSize) ) {
      currentFrag = frags.pop();
-      currentFrag += nextSymbol;
+      currentFrag.text += nextSymbol;
    } else {
-      currentFrag = nextSymbol;
+      currentFrag = {text:nextSymbol, offset:index};
      currentLength = 0;
    }
    currentLength++;
--- a/libraryServer.js
+++ b/libraryServer.js
@ -5,6 +5,7 @@ import express from 'express';
 import args from './args.js';
 import {DEBUG, say, sleep, APP_ROOT, SNIP_CONTEXT} from './common.js';
 import Archivist from './archivist.js';
+import {highlight} from './highlighter.js';

 const SITE_PATH = path.resolve(APP_ROOT, 'public');

@ -63,7 +64,9 @@ function addHandlers() {
      }, null, 2));
    } else {
      results.forEach(r => {
-        r.snippet = Archivist.findOffsets(query, r.content.slice(0,150));
+        r.snippet = Archivist.findOffsets(query, 
+          highlight(query, r.content).map(hl => hl.fragment.text).join('&hellip;')
+        );
      });
      res.end(SearchResultView({results, query, HL}));
    }