"Snippets"

2021-12-23 11:01:20 +08:00 · 2021-12-23 11:01:20 +08:00 · 37013ed080
parent d1c8cef7c9
commit 37013ed080
5 changed files with 135 additions and 28 deletions
--- a/archivist.js
+++ b/archivist.js
@ -28,6 +28,8 @@
    //import Fuzzy from 'fz-search';
    import * as _Fuzzy from './lib/fz.js';
    import Nat from 'natural';
+    //import match from 'autosuggest-highlight/match';
+    //import parse from 'autosuggest-highlight/parse';

  import args from './args.js';
  import {
@ -859,22 +861,10 @@ export default Archivist;
    return {url, title, id, content};
  }

-  function findOffsets(query, doc, count) {
-    // this is the slow part
-    let res = [];
-      
-    const result = Nat.LevenshteinDistanceSearch(query, doc);
-
-    if ( result.distance/result.substring.length < 0.5 ) {
-      const {substring,offset} = result;
-      res.push(
-        doc.substring(offset-50, offset) +
-        `<strong>${substring}</strong>` + 
-        doc.substr(substring.length + offset, 50)
-      );
-    }
-
-    return res;
+  function findOffsets(query, doc, count = 0) { // returns whatever fuzzy.highlight produces for `doc`
+    const hl = fuzzy.highlight(doc); // NOTE(review): `query` and `count` are not used in this body; presumably `fuzzy` already holds the active query — confirm
+    DEBUG && console.log(hl); // logged only when DEBUG is truthy
+    return hl;
  }

  function beforePathChanged() {
@ -944,8 +934,8 @@ export default Archivist;
      const title = State.Index.get(obj.url)?.title;
      return {
        id: obj.id,
-        url: Archivist.findOffsets(query, obj.url)[0] || obj.url,
-        title: Archivist.findOffsets(query, title)[0] || title,
+        url: Archivist.findOffsets(query, obj.url) || obj.url,
+        title: Archivist.findOffsets(query, title) || title,
      };
    });
    highlights.forEach(hl => HL.set(hl.id, hl));
@ -961,9 +951,14 @@ export default Archivist;
    fuzz.forEach(countRank(score));
  
    const results = [...Object.values(score)].map(obj => {
-      const {id} = State.Index.get(obj.url); 
-      obj.id = id;
-      return obj;
+      try {
+        const {id} = State.Index.get(obj.url); 
+        obj.id = id;
+        return obj;
+      } catch(e) {
+        console.log(obj, State.Index, e);
+        throw e;
+      }
    });
    results.sort(({score:scoreA}, {score:scoreB}) => scoreA-scoreB);
    const resultIds = results.map(({id}) => id);
--- a/lib/fuzzy.js
+++ b/lib/fuzzy.js
@ -0,0 +1,109 @@
+/**
+ * Modified Dec 23 2021 by Cris Stringfellow
+ * fuzzy.js v0.1.0
+ * (c) 2016 Ben Ripkens
+ * @license: MIT
+ */
+  // NOTES
+    /*
+     * Whether or not fuzzy.js should analyze sub-terms, i.e. also
+     * check term starting positions != 0.
+     *
+     * Example:
+     * Given the term 'Halleluja' and query 'luja'
+     *
+     * Fuzzy.js scores this combination with a 14, when analyzeSubTerms is
+     * set to false, as the following matching string will be calculated:
+     * Ha[l]lel[uja]
+     *
+     * If you activate sub-term analysis though, the query will reach a score
+     * of 19, as the matching string looks as follows:
+     * Halle[luja]
+     *
+     * Naturally, the second version is more expensive than the first one.
+     * You should therefore configure how many sub terms you wish to analyse.
+     * This can be configured through fuzzy.analyzeSubTermDepth = 10.
+     */
+  fuzzy.analyzeSubTerms = false; // when false, fuzzy() only scores the term from position 0
+    /*
+     * How many sub terms should be analyzed.
+     */
+  fuzzy.analyzeSubTermDepth = 10; // exclusive upper bound on the start offsets tried by fuzzy()
+  fuzzy.highlighting = { // markers placed around each matched character in highlightedTerm
+    before: '<em>',
+    after: '</em>'
+  };
+  fuzzy.matchComparator = function matchComparator(m1, m2) { // sort comparator: higher score first, ties broken by shorter term
+    return (m2.score - m1.score != 0) ? m2.score - m1.score : m1.term.length - m2.term.length;
+  };
+
+  export default function fuzzy(term, query) { // best-scoring match of `query` against `term`
+    var max = calcFuzzyScore(term, query);
+    var termLength = term.length;
+
+    if (fuzzy.analyzeSubTerms) {
+
+      for (var i = 1; i < termLength && i < fuzzy.analyzeSubTermDepth; i++) { // also score each suffix of term starting at i
+        var subTerm = term.substring(i);
+        var score = calcFuzzyScore(subTerm, query);
+        if (score.score > max.score) {
+          // we need to correct 'term' and 'highlightedTerm', as calcFuzzyScore
+          // does not know that it operates on a substring. Doing it only for
+          // a new maximum score to save some performance.
+          score.term = term;
+          score.highlightedTerm = term.substring(0, i) + score.highlightedTerm; // re-attach the skipped prefix, unhighlighted
+          max = score;
+        }
+      }
+    }
+
+    return max; // object with score, term, query and highlightedTerm, as built by calcFuzzyScore
+  }
+
+  function calcFuzzyScore(term, query) { // matches query characters in order, case-insensitively, against term; returns score plus highlighted term
+    var score = 0;
+    var termLength = term.length;
+    var queryLength = query.length;
+    var highlighting = ''; // copy of term with each matched character wrapped in fuzzy.highlighting.before/after
+    var ti = 0; // next term index to examine; it only moves forward across query characters
+    // -1 would not work as this would break the calculations of bonus
+    // points for subsequent character matches. Something like
+    // Number.MIN_VALUE would be more appropriate, but unfortunately
+    // Number.MIN_VALUE + 1 equals 1...
+    var previousMatchingCharacter = -2;
+
+    for (var qi = 0; qi < queryLength && ti < termLength; qi++) { // stops early once the term is exhausted
+      var qc = query.charAt(qi);
+      var lowerQc = qc.toLowerCase();
+
+      for (; ti < termLength; ti++) { // scan forward in term for the current query character
+        var tc = term.charAt(ti);
+
+        if (lowerQc === tc.toLowerCase()) {
+          score++; // one point per matched character
+
+          if ((previousMatchingCharacter + 1) === ti) {
+            score += 5; // bonus when this match directly follows the previous match
+          }
+
+          highlighting += fuzzy.highlighting.before +
+              tc +
+              fuzzy.highlighting.after;
+          previousMatchingCharacter = ti;
+          ti++; // consume the matched character before moving to the next query character
+          break;
+        } else {
+          highlighting += tc; // unmatched characters are copied through unchanged
+        }
+      }
+    }
+
+    highlighting += term.substring(ti, term.length); // append the part of term that was never scanned
+
+    return {
+      score: score,
+      term: term,
+      query: query,
+      highlightedTerm: highlighting
+    };
+  };
--- a/lib/testFuzzy.js
+++ b/lib/testFuzzy.js
@ -0,0 +1,6 @@
+import fuzzy from './fuzzy.js';
+
+console.log(fuzzy); // logs the imported default export itself
+
+const doc = 'Meghan Markle requested this unexpected Christmas present for Archie from the Queen';
+console.log(fuzzy(doc, 'Queen')); // logs the match object for query 'Queen' against doc
--- a/libraryServer.js
+++ b/libraryServer.js
@ -63,12 +63,7 @@ function addHandlers() {
      }, null, 2));
    } else {
      results.forEach(r => {
-        const m = Archivist.findOffsets(query, r.content);
-        if ( m.length ) {
-          r.snippet = m;
-        } else {
-          r.snippet = [r.content.slice(0, 150)];
-        }
+        r.snippet = Archivist.findOffsets(query, r.content.slice(0,150));
      });
      res.end(SearchResultView({results, query, HL}));
    }
@ -249,7 +244,7 @@ function SearchResultView({results, query, HL}) {
          ${DEBUG ? id + ':' : ''} <a target=_blank href=${url}>${HL.get(id)?.title||title||url}</a>
          <br>
          <small>${(HL.get(id)?.url||url).slice(0,128)}</small>
-          <p>${snippet.join('&hellip;')}</p>
+          <p>${snippet}</p>
        </li>
      `).join('\n')
    }
--- a/2
+++ b/2
@ -1,3 +1,5 @@
+- use ukkonen to find snippets
+- we could also add signal from the highlighting to just in time alter the order (e.g. 'hell wiki' search brings google search to top rank, but the Hell wikipedia page has more highlight visible)
 - Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
 - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
 - Improve search page look