"on way to better snippets and term highlighting. Next steps are grow and expand the best segments to improve scores"

This commit is contained in:
Cris Stringfellow 2021-12-22 23:13:51 +08:00
parent 9b5e4120c1
commit 6b4bb97352
3 changed files with 879 additions and 0 deletions

176
highlighter.js Normal file
View File

@ -0,0 +1,176 @@
import ukkonen from 'ukkonen';
// Number of symbols per document fragment handed to the edit-distance scorer.
const CHUNK_SIZE = 24;
// Default acceptance cut-off for highlight(): fragments whose scaled score
// is above this value are not returned.
const MAX_ACCEPT_SCORE = 0.5;

// Run the built-in smoke test as soon as the module is evaluated.
testHighlighter();
/**
 * Finds the fragments of `doc` that most closely match `query`.
 *
 * `doc` is cut into consecutive, non-overlapping fragments of CHUNK_SIZE
 * symbols. Each fragment is scored by its edit distance to `query` (computed
 * by `ukkonen`), rescaled so that 0 is the closest a full-size fragment can
 * get to the query and 1 is the furthest.
 *
 * @param {string} query - text to look for
 * @param {string} doc - text to search in
 * @param {Object} [options]
 * @param {number} [options.maxAcceptScore=MAX_ACCEPT_SCORE] - fragments whose
 *   scaled score is above this value are dropped
 * @returns {Array<{score: number, fragment: string}>} the accepted fragments,
 *   best (lowest score) first; empty when the query is empty
 */
export function highlight(query, doc, {
  maxAcceptScore = MAX_ACCEPT_SCORE
} = {}) {
  const highlights = [];
  // use Array.from then length rather than string length to
  // give accurate length for all unicode
  const qLength = Array.from(query).length;
  if ( qLength === 0 ) {
    // With an empty query the scaling range below is 0, every score becomes
    // NaN, and NaN slips past the `score > maxAcceptScore` check, so every
    // fragment would be returned. An empty query matches nothing.
    return highlights;
  }
  // the min score possible = the minimum number of edits between
  // the query and a full-size fragment (their difference in length)
  const MinScore = Math.abs(qLength - CHUNK_SIZE);
  // width of the range of distances above MinScore
  const MaxScore = Math.max(qLength, CHUNK_SIZE) - MinScore;
  const fragments = Array.from(doc).reduce(getFragmenter(CHUNK_SIZE), []);
  const scores = fragments.map(fragment => {
    // ukkonen reports the threshold itself once the real distance exceeds it.
    // A fixed threshold of CHUNK_SIZE therefore clipped the distance for long
    // queries, pushing every scaled score under the acceptance cut-off.
    // An edit distance can never exceed the longer string's length, so this
    // bound never clips the result.
    const maxDist = Math.max(query.length, fragment.length);
    const distance = ukkonen(query, fragment, maxDist);
    // NOTE: the final fragment may be shorter than CHUNK_SIZE, so its scaled
    // score can fall outside the 0..1 range.
    const scaledScore = (distance - MinScore)/MaxScore;
    return {score: scaledScore, fragment};
  });
  // sort ascending (smallest scores win)
  scores.sort(({score:a}, {score:b}) => a-b);
  console.log({scores});
  for( const {score, fragment} of scores ) {
    // scores are sorted, so the first rejected one ends the scan
    if ( score > maxAcceptScore ) {
      break;
    }
    highlights.push({score,fragment});
  }
  if ( highlights.length === 0 ) {
    console.log('Zero highlights, showing first score', scores[0]);
  }
  return highlights;
}
/**
 * Builds a reducer callback that packs a stream of symbols into
 * non-overlapping string fragments holding at most `chunkSize` symbols each.
 *
 * Intended use: `symbols.reduce(getFragmenter(n), [])`.
 *
 * @param {number} chunkSize - maximum symbols per fragment; an integer >= 1
 * @returns {function(string[], string, number, Array): string[]} reducer that
 *   appends each symbol to the accumulator array and returns that array
 * @throws {TypeError} when `chunkSize` is not an integer of at least 1
 */
function getFragmenter(chunkSize) {
  const sizeIsValid = Number.isInteger(chunkSize) && chunkSize >= 1;
  if ( !sizeIsValid ) {
    throw new TypeError(`chunkSize needs to be a whole number greater than 0`);
  }

  // how many symbols the last fragment in the accumulator already holds
  let filled;

  return function fragment(frags, nextSymbol, index, symbols) {
    // open a fresh fragment when nothing has been collected yet, or when the
    // last fragment has no room for one more symbol
    const hasRoom = (filled + 1) <= chunkSize;
    const startFresh = frags.length === 0 || !hasRoom;

    if ( startFresh ) {
      frags.push(nextSymbol);
      filled = 1;
    } else {
      frags[frags.length - 1] += nextSymbol;
      filled += 1;
    }

    return frags;
  };
}
// TODO: add a second fragmenter that returns a function creating
// overlapping fragments, and try it as well
// tests
// Ad-hoc smoke test for highlight().
// Runs the query 'metahead search' against a pasted Hacker News front-page
// listing and prints the returned highlights as pretty-printed JSON
// (2-space indent) to the console. Takes no arguments and returns nothing.
function testHighlighter() {
console.log(JSON.stringify(highlight(
'metahead search',
`
Hacker News new | past | comments | ask | show | jobs | submit login
1.
AWS appears to be down again
417 points by riknox 2 hours ago | hide | 260 comments
2.
FreeBSD Jails for Fun and Profit (topikettunen.com)
42 points by kettunen 1 hour ago | hide | discuss
3.
IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
33 points by pueblito 1 hour ago | hide | 18 comments
4.
DNA seen through the eyes of a coder (berthub.eu)
116 points by dunefox 3 hours ago | hide | 37 comments
5.
Pure Bash lightweight web server (github.com/remileduc)
74 points by turrini 2 hours ago | hide | 46 comments
6.
Parser Combinators in Haskell (serokell.io)
18 points by aroccoli 1 hour ago | hide | 3 comments
7.
DeepMinds New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
233 points by darkscape 9 hours ago | hide | 88 comments
8.
Tinder just permabanned me or the problem with big tech (paulefou.com)
90 points by svalee 1 hour ago | hide | 106 comments
9.
Rocky Mountain Basic (wikipedia.org)
12 points by mattowen_uk 1 hour ago | hide | 5 comments
10.
Teller Reveals His Secrets (2012) (smithsonianmag.com)
56 points by Tomte 4 hours ago | hide | 26 comments
11.
Heroku Is Currently Down (heroku.com)
129 points by iamricks 2 hours ago | hide | 29 comments
12. Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
2 hours ago | hide
13.
Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
187 points by Petiver 9 hours ago | hide | 111 comments
14.
I did a Mixergy interview so bad they didn't even release it (robfitz.com)
15 points by robfitz 1 hour ago | hide | 7 comments
15.
Now DuckDuckGo is building its own desktop browser (zdnet.com)
132 points by waldekm 2 hours ago | hide | 64 comments
16.
English has been my pain for 15 years (2013) (antirez.com)
105 points by Tomte 1 hour ago | hide | 169 comments
17.
Polish opposition duo hacked with NSO spyware (apnews.com)
102 points by JumpCrisscross 2 hours ago | hide | 35 comments
18.
Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
119 points by rbanffy 3 hours ago | hide | 105 comments
19.
LGs new 16:18 monitor (theverge.com)
50 points by tosh 1 hour ago | hide | 25 comments
20.
Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
117 points by marcodiego 9 hours ago | hide | 16 comments
21.
Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
27 points by EntICOnc 4 hours ago | hide | 2 comments
22.
Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
269 points by kerm1t 4 hours ago | hide | 196 comments
23.
Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
103 points by jdkee 9 hours ago | hide | 55 comments
24.
Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Years Best Mysteries (atlasobscura.com)
8 points by CapitalistCartr 1 hour ago | hide | 1 comment
25.
Isolating Xwayland in a VM (roscidus.com)
94 points by pmarin 9 hours ago | hide | 32 comments
26.
Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
4 points by jawerty 1 hour ago | hide | 15 comments
27.
Quantum theory based on real numbers can be experimentally falsified (nature.com)
159 points by SquibblesRedux 14 hours ago | hide | 93 comments
28.
Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
29 points by healsdata 1 hour ago | hide | 7 comments
29.
Waffle House Poet Laureate (2019) (atlantamagazine.com)
5 points by brudgers 1 hour ago | hide | 4 comments
30.
Earths magnetic field illuminates Biblical history (economist.com)
46 points by helsinkiandrew 8 hours ago | hide | 17 comments
More
`
), null, 2));
}

1
todo
View File

@ -1,4 +1,5 @@
- switch to a new query term highlighter in search result / title and url. Nat works OK for short documents (title, url), but sometimes breaks on long ones.
DONE - made my own! ;p ;) xx;p
- Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
- an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
- Improve search page look

702
x Normal file
View File

@ -0,0 +1,702 @@
[
{
"score": 0.2,
"fragment": "\n\t\t\t\tHacker News new | p"
},
{
"score": 0.2,
"fragment": "ast | comments | ask | s"
},
{
"score": 0.2,
"fragment": "how | jobs | submit \tlog"
},
{
"score": 0.2,
"fragment": "in\n\t\t\t1. \t\n\t\t\t\tAWS appea"
},
{
"score": 0.2,
"fragment": "rs to be down again\n\t\t\t\t"
},
{
"score": 0.2,
"fragment": "417 points by riknox 2 h"
},
{
"score": 0.2,
"fragment": "ours ago | hide | 260 co"
},
{
"score": 0.2,
"fragment": "mments\n\t\t\t2. \t\n\t\t\t\tFreeB"
},
{
"score": 0.2,
"fragment": "SD Jails for Fun and Pro"
},
{
"score": 0.2,
"fragment": "fit (topikettunen.com)\n\t"
},
{
"score": 0.2,
"fragment": "\t\t\t42 points by kettunen"
},
{
"score": 0.2,
"fragment": " 1 hour ago | hide | dis"
},
{
"score": 0.2,
"fragment": "cuss\n\t\t\t3. \t\n\t\t\t\tIMF, 10"
},
{
"score": 0.2,
"fragment": " countries simulate cybe"
},
{
"score": 0.2,
"fragment": "r attack on global finan"
},
{
"score": 0.2,
"fragment": "cial system (nasdaq.com)"
},
{
"score": 0.2,
"fragment": "\n\t\t\t\t33 points by puebli"
},
{
"score": 0.2,
"fragment": "to 1 hour ago | hide | 1"
},
{
"score": 0.2,
"fragment": "8 comments\n\t\t\t4. \t\n\t\t\t\tD"
},
{
"score": 0.2,
"fragment": "NA seen through the eyes"
},
{
"score": 0.2,
"fragment": " of a coder (berthub.eu)"
},
{
"score": 0.2,
"fragment": "\n\t\t\t\t116 points by dunef"
},
{
"score": 0.2,
"fragment": "ox 3 hours ago | hide | "
},
{
"score": 0.2,
"fragment": "37 comments\n\t\t\t5. \t\n\t\t\t\t"
},
{
"score": 0.2,
"fragment": "Pure Bash lightweight we"
},
{
"score": 0.2,
"fragment": "b server (github.com/rem"
},
{
"score": 0.2,
"fragment": "ileduc)\n\t\t\t\t74 points by"
},
{
"score": 0.2,
"fragment": " turrini 2 hours ago | h"
},
{
"score": 0.2,
"fragment": "ide | 46 comments\n\t\t\t6. "
},
{
"score": 0.2,
"fragment": "\t\n\t\t\t\tParser Combinators"
},
{
"score": 0.2,
"fragment": " in Haskell (serokell.io"
},
{
"score": 0.2,
"fragment": ")\n\t\t\t\t18 points by arocc"
},
{
"score": 0.2,
"fragment": "oli 1 hour ago | hide | "
},
{
"score": 0.2,
"fragment": "3 comments\n\t\t\t7. \t\n\t\t\t\tD"
},
{
"score": 0.2,
"fragment": "eepMinds New AI with a "
},
{
"score": 0.2,
"fragment": "Memory Outperforms Algor"
},
{
"score": 0.2,
"fragment": "ithms 25 Times Its Size "
},
{
"score": 0.2,
"fragment": "(singularityhub.com)\n\t\t\t"
},
{
"score": 0.2,
"fragment": "\t233 points by darkscape"
},
{
"score": 0.2,
"fragment": " 9 hours ago | hide | 88"
},
{
"score": 0.2,
"fragment": " comments\n\t\t\t8. \t\n\t\t\t\tTi"
},
{
"score": 0.2,
"fragment": "nder just permabanned me"
},
{
"score": 0.2,
"fragment": " or the problem with big"
},
{
"score": 0.2,
"fragment": " tech (paulefou.com)\n\t\t\t"
},
{
"score": 0.2,
"fragment": "\t90 points by svalee 1 h"
},
{
"score": 0.2,
"fragment": "our ago | hide | 106 com"
},
{
"score": 0.2,
"fragment": "ments\n\t\t\t9. \t\n\t\t\t\tRocky "
},
{
"score": 0.2,
"fragment": "Mountain Basic (wikipedi"
},
{
"score": 0.2,
"fragment": "a.org)\n\t\t\t\t12 points by "
},
{
"score": 0.2,
"fragment": "mattowen_uk 1 hour ago |"
},
{
"score": 0.2,
"fragment": " hide | 5 comments\n\t\t\t10"
},
{
"score": 0.2,
"fragment": ". \t\n\t\t\t\tTeller Reveals H"
},
{
"score": 0.2,
"fragment": "is Secrets (2012) (smith"
},
{
"score": 0.2,
"fragment": "sonianmag.com)\n\t\t\t\t56 po"
},
{
"score": 0.2,
"fragment": "ints by Tomte 4 hours ag"
},
{
"score": 0.2,
"fragment": "o | hide | 26 comments\n\t"
},
{
"score": 0.2,
"fragment": "\t\t11. \t\n\t\t\t\tHeroku Is Cu"
},
{
"score": 0.2,
"fragment": "rrently Down (heroku.com"
},
{
"score": 0.2,
"fragment": ")\n\t\t\t\t129 points by iamr"
},
{
"score": 0.2,
"fragment": "icks 2 hours ago | hide "
},
{
"score": 0.2,
"fragment": "| 29 comments\n\t\t\t12. \t\tC"
},
{
"score": 0.2,
"fragment": "onvictional (YC W19) is "
},
{
"score": 0.2,
"fragment": "hiring engineers to buil"
},
{
"score": 0.2,
"fragment": "d the future of B2B trad"
},
{
"score": 0.2,
"fragment": "e-Remote (ashbyhq.com)\n\t"
},
{
"score": 0.2,
"fragment": "\t\t\t2 hours ago | hide\n\t\t"
},
{
"score": 0.2,
"fragment": "\t13. \t\n\t\t\t\tScientists fi"
},
{
"score": 0.2,
"fragment": "nd preserved dinosaur em"
},
{
"score": 0.2,
"fragment": "bryo preparing to hatch "
},
{
"score": 0.2,
"fragment": "like a bird (theguardian"
},
{
"score": 0.2,
"fragment": ".com)\n\t\t\t\t187 points by "
},
{
"score": 0.2,
"fragment": "Petiver 9 hours ago | hi"
},
{
"score": 0.2,
"fragment": "de | 111 comments\n\t\t\t14."
},
{
"score": 0.2,
"fragment": " \t\n\t\t\t\tI did a Mixergy i"
},
{
"score": 0.2,
"fragment": "nterview so bad they did"
},
{
"score": 0.2,
"fragment": "n't even release it (rob"
},
{
"score": 0.2,
"fragment": "fitz.com)\n\t\t\t\t15 points "
},
{
"score": 0.2,
"fragment": "by robfitz 1 hour ago | "
},
{
"score": 0.2,
"fragment": "hide | 7 comments\n\t\t\t15."
},
{
"score": 0.2,
"fragment": " \t\n\t\t\t\tNow DuckDuckGo is"
},
{
"score": 0.2,
"fragment": " building its own deskto"
},
{
"score": 0.2,
"fragment": "p browser (zdnet.com)\n\t\t"
},
{
"score": 0.2,
"fragment": "\t\t132 points by waldekm "
},
{
"score": 0.2,
"fragment": "2 hours ago | hide | 64 "
},
{
"score": 0.2,
"fragment": "comments\n\t\t\t16. \t\n\t\t\t\tEn"
},
{
"score": 0.2,
"fragment": "glish has been my pain f"
},
{
"score": 0.2,
"fragment": "or 15 years (2013) (anti"
},
{
"score": 0.2,
"fragment": "rez.com)\n\t\t\t\t105 points "
},
{
"score": 0.2,
"fragment": "by Tomte 1 hour ago | hi"
},
{
"score": 0.2,
"fragment": "de | 169 comments\n\t\t\t17."
},
{
"score": 0.2,
"fragment": " \t\n\t\t\t\tPolish opposition"
},
{
"score": 0.2,
"fragment": " duo hacked with NSO spy"
},
{
"score": 0.2,
"fragment": "ware (apnews.com)\n\t\t\t\t10"
},
{
"score": 0.2,
"fragment": "2 points by JumpCrisscro"
},
{
"score": 0.2,
"fragment": "ss 2 hours ago | hide | "
},
{
"score": 0.2,
"fragment": "35 comments\n\t\t\t18. \t\n\t\t\t"
},
{
"score": 0.2,
"fragment": "\tLinux Has Grown into a "
},
{
"score": 0.2,
"fragment": "Viable PC Gaming Platfor"
},
{
"score": 0.2,
"fragment": "m and the Steam Stats Pr"
},
{
"score": 0.2,
"fragment": "ove It (hothardware.com)"
},
{
"score": 0.2,
"fragment": "\n\t\t\t\t119 points by rbanf"
},
{
"score": 0.2,
"fragment": "fy 3 hours ago | hide | "
},
{
"score": 0.2,
"fragment": "105 comments\n\t\t\t19. \t\n\t\t"
},
{
"score": 0.2,
"fragment": "\t\tLGs new 16:18 monitor"
},
{
"score": 0.2,
"fragment": " (theverge.com)\n\t\t\t\t50 p"
},
{
"score": 0.2,
"fragment": "oints by tosh 1 hour ago"
},
{
"score": 0.2,
"fragment": " | hide | 25 comments\n\t\t"
},
{
"score": 0.2,
"fragment": "\t20. \t\n\t\t\t\tConstruction "
},
{
"score": 0.2,
"fragment": "of radio equipment in a "
},
{
"score": 0.2,
"fragment": "Japanese PoW camp (bourn"
},
{
"score": 0.2,
"fragment": "emouth.ac.uk)\n\t\t\t\t117 po"
},
{
"score": 0.2,
"fragment": "ints by marcodiego 9 hou"
},
{
"score": 0.2,
"fragment": "rs ago | hide | 16 comme"
},
{
"score": 0.2,
"fragment": "nts\n\t\t\t21. \t\n\t\t\t\tEveryth"
},
{
"score": 0.2,
"fragment": "ing I've seen on optimiz"
},
{
"score": 0.2,
"fragment": "ing Postgres on ZFS (vad"
},
{
"score": 0.2,
"fragment": "osware.io)\n\t\t\t\t27 points"
},
{
"score": 0.2,
"fragment": " by EntICOnc 4 hours ago"
},
{
"score": 0.2,
"fragment": " | hide | 2 comments\n\t\t\t"
},
{
"score": 0.2,
"fragment": "22. \t\n\t\t\t\tMicrosoft Team"
},
{
"score": 0.2,
"fragment": "s: 1 feature, 4 vulnerab"
},
{
"score": 0.2,
"fragment": "ilities (positive.securi"
},
{
"score": 0.2,
"fragment": "ty)\n\t\t\t\t269 points by ke"
},
{
"score": 0.2,
"fragment": "rm1t 4 hours ago | hide "
},
{
"score": 0.2,
"fragment": "| 196 comments\n\t\t\t23. \t\n"
},
{
"score": 0.2,
"fragment": "\t\t\t\tAnalog computers wer"
},
{
"score": 0.2,
"fragment": "e the most powerful comp"
},
{
"score": 0.2,
"fragment": "uters for thousands of y"
},
{
"score": 0.2,
"fragment": "ears [video] (youtube.co"
},
{
"score": 0.2,
"fragment": "m)\n\t\t\t\t103 points by jdk"
},
{
"score": 0.2,
"fragment": "ee 9 hours ago | hide | "
},
{
"score": 0.2,
"fragment": "55 comments\n\t\t\t24. \t\n\t\t\t"
},
{
"score": 0.2,
"fragment": "\tShipwrecks, Stolen Jewe"
},
{
"score": 0.2,
"fragment": "ls, Skull-Blasting Are S"
},
{
"score": 0.2,
"fragment": "ome of This Years Best "
},
{
"score": 0.2,
"fragment": "Mysteries (atlasobscura."
},
{
"score": 0.2,
"fragment": "com)\n\t\t\t\t8 points by Cap"
},
{
"score": 0.2,
"fragment": "italistCartr 1 hour ago "
},
{
"score": 0.2,
"fragment": "| hide | 1 comment\n\t\t\t25"
},
{
"score": 0.2,
"fragment": ". \t\n\t\t\t\tIsolating Xwayla"
},
{
"score": 0.2,
"fragment": "nd in a VM (roscidus.com"
},
{
"score": 0.2,
"fragment": ")\n\t\t\t\t94 points by pmari"
},
{
"score": 0.2,
"fragment": "n 9 hours ago | hide | 3"
},
{
"score": 0.2,
"fragment": "2 comments\n\t\t\t26. \t\n\t\t\t\t"
},
{
"score": 0.2,
"fragment": "Show HN: Metaheads, a se"
},
{
"score": 0.2,
"fragment": "arch engine for Facebook"
},
{
"score": 0.2,
"fragment": " comments (metaheads.xyz"
},
{
"score": 0.2,
"fragment": ")\n\t\t\t\t4 points by jawert"
},
{
"score": 0.2,
"fragment": "y 1 hour ago | hide | 15"
},
{
"score": 0.2,
"fragment": " comments\n\t\t\t27. \t\n\t\t\t\tQ"
},
{
"score": 0.2,
"fragment": "uantum theory based on r"
},
{
"score": 0.2,
"fragment": "eal numbers can be exper"
},
{
"score": 0.2,
"fragment": "imentally falsified (nat"
},
{
"score": 0.2,
"fragment": "ure.com)\n\t\t\t\t159 points "
},
{
"score": 0.2,
"fragment": "by SquibblesRedux 14 hou"
},
{
"score": 0.2,
"fragment": "rs ago | hide | 93 comme"
},
{
"score": 0.2,
"fragment": "nts\n\t\t\t28. \t\n\t\t\t\tFounder"
},
{
"score": 0.2,
"fragment": " of Black Girls Code has"
},
{
"score": 0.2,
"fragment": " been ousted as head of "
},
{
"score": 0.2,
"fragment": "the nonprofit (businessi"
},
{
"score": 0.2,
"fragment": "nsider.com)\n\t\t\t\t29 point"
},
{
"score": 0.2,
"fragment": "s by healsdata 1 hour ag"
},
{
"score": 0.2,
"fragment": "o | hide | 7 comments\n\t\t"
},
{
"score": 0.2,
"fragment": "\t29. \t\n\t\t\t\tWaffle House "
},
{
"score": 0.2,
"fragment": "Poet Laureate (2019) (at"
},
{
"score": 0.2,
"fragment": "lantamagazine.com)\n\t\t\t\t5"
},
{
"score": 0.2,
"fragment": " points by brudgers 1 ho"
},
{
"score": 0.2,
"fragment": "ur ago | hide | 4 commen"
},
{
"score": 0.2,
"fragment": "ts\n\t\t\t30. \t\n\t\t\t\tEarths "
},
{
"score": 0.2,
"fragment": "magnetic field illuminat"
},
{
"score": 0.2,
"fragment": "es Biblical history (eco"
},
{
"score": 0.2,
"fragment": "nomist.com)\n\t\t\t\t46 point"
},
{
"score": 0.2,
"fragment": "s by helsinkiandrew 8 ho"
},
{
"score": 0.2,
"fragment": "urs ago | hide | 17 comm"
},
{
"score": 0.2,
"fragment": "ents\n\t\t\t\tMore\n "
}
]