diff --git a/highlighter.js b/highlighter.js
new file mode 100644
index 0000000..a1b50b0
--- /dev/null
+++ b/highlighter.js
@@ -0,0 +1,220 @@
+import ukkonen from 'ukkonen';
+
+const MAX_ACCEPT_SCORE = 0.5;
+const CHUNK_SIZE = 24;
+
+// NOTE(review): this runs the smoke test below every time the module is
+// imported. Kept so module-load behaviour stays the same; consider moving it
+// into a separate script.
+testHighlighter();
+
+/**
+ * Finds the fragments of `doc` that are closest to `query`.
+ *
+ * `doc` is cut into consecutive, non-overlapping fragments of up to CHUNK_SIZE
+ * symbols. Each fragment is scored by its edit distance to `query`, rescaled
+ * between the smallest and the largest distance possible for the two lengths,
+ * so lower scores are better matches.
+ *
+ * @param {string} query - text to look for
+ * @param {string} doc - text to search in
+ * @param {object} [options]
+ * @param {number} [options.maxAcceptScore] - highest score still accepted as a
+ *   highlight; defaults to MAX_ACCEPT_SCORE
+ * @param {boolean} [options.debug] - log the scores to the console; defaults
+ *   to false
+ * @returns {{score: number, fragment: string}[]} accepted fragments, lowest
+ *   score first; empty when `query` is empty
+ */
+export function highlight(query, doc, {
+  maxAcceptScore = MAX_ACCEPT_SCORE,
+  debug = false
+} = {}) {
+  // use Array.from(...).length rather than string length to
+  // give accurate length for all unicode
+  const qLength = Array.from(query).length;
+
+  // an empty query would make the scaling below divide by zero, and every
+  // fragment would slip through with a NaN score
+  if ( qLength === 0 ) {
+    return [];
+  }
+
+  const fragments = Array.from(doc).reduce(getFragmenter(CHUNK_SIZE), []);
+
+  const scores = fragments.map(fragment => {
+    // bounds are per fragment: the last fragment can be shorter than
+    // CHUNK_SIZE, and with fixed bounds a short tail could score under the
+    // cut-off whatever it contained
+    const fLength = Array.from(fragment).length;
+    // fewest edits possible = the difference in length
+    const minDistance = Math.abs(qLength - fLength);
+    // most edits possible = replace every symbol of the longer string
+    const maxDistance = Math.max(qLength, fLength);
+    // NOTE(review): assumes the third argument caps the returned distance,
+    // so a cap of maxDistance never truncates - confirm in the ukkonen docs
+    const distance = ukkonen(query, fragment, maxDistance);
+    // the divisor is the shorter of the two lengths, at least 1 here
+    const score = (distance - minDistance)/(maxDistance - minDistance);
+    return {score, fragment};
+  });
+
+  // sort ascending (smallest scores win)
+  scores.sort(({score:a}, {score:b}) => a-b);
+  if ( debug ) {
+    console.log({scores});
+  }
+
+  const highlights = [];
+  for( const {score, fragment} of scores ) {
+    if ( score > maxAcceptScore ) {
+      break;
+    }
+    highlights.push({score,fragment});
+  }
+
+  if ( debug && highlights.length === 0 ) {
+    console.log('Zero highlights, showing first score', scores[0]);
+  }
+
+  return highlights;
+}
+
+/**
+ * Returns a reducer for Array.prototype.reduce that packs a sequence of
+ * symbols into consecutive, non-overlapping fragments.
+ *
+ * @param {number} chunkSize - most symbols a fragment may hold
+ * @returns {function(string[], string): string[]} reducer that adds the next
+ *   symbol to the list of fragments and returns that list
+ * @throws {TypeError} when chunkSize is not an integer of at least 1
+ */
+function getFragmenter(chunkSize) {
+  if ( !Number.isInteger(chunkSize) || chunkSize < 1 ) {
+    throw new TypeError(`chunkSize needs to be a whole number greater than 0`);
+  }
+
+  // symbols in the fragment being filled; starting out "full" makes the
+  // first symbol open a new fragment
+  let currentLength = chunkSize;
+
+  return function fragment(frags, nextSymbol) {
+    if ( frags.length > 0 && currentLength < chunkSize ) {
+      // room left: extend the running fragment in place
+      frags[frags.length - 1] += nextSymbol;
+      currentLength++;
+    } else {
+      // no running fragment, or it is full: start a new one
+      frags.push(nextSymbol);
+      currentLength = 1;
+    }
+    return frags;
+  };
+}
+
+// returns a function that creates overlapping fragments
+// todo - try this one as well
+
+
+/**
+ * Smoke test: runs highlight() over a saved Hacker News front page and
+ * prints the result as JSON.
+ */
+function testHighlighter() {
+  console.log(JSON.stringify(highlight(
+    'metahead search',
+    `
+				Hacker News new | past | comments | ask | show | jobs | submit 	login
+			1. 	
+				AWS appears to be down again
+				417 points by riknox 2 hours ago | hide | 260 comments
+			2. 	
+				FreeBSD Jails for Fun and Profit (topikettunen.com)
+				42 points by kettunen 1 hour ago | hide | discuss
+			3. 	
+				IMF, 10 countries simulate cyber attack on global financial system (nasdaq.com)
+				33 points by pueblito 1 hour ago | hide | 18 comments
+			4. 	
+				DNA seen through the eyes of a coder (berthub.eu)
+				116 points by dunefox 3 hours ago | hide | 37 comments
+			5. 	
+				Pure Bash lightweight web server (github.com/remileduc)
+				74 points by turrini 2 hours ago | hide | 46 comments
+			6. 	
+				Parser Combinators in Haskell (serokell.io)
+				18 points by aroccoli 1 hour ago | hide | 3 comments
+			7. 	
+				DeepMind’s New AI with a Memory Outperforms Algorithms 25 Times Its Size (singularityhub.com)
+				233 points by darkscape 9 hours ago | hide | 88 comments
+			8. 	
+				Tinder just permabanned me or the problem with big tech (paulefou.com)
+				90 points by svalee 1 hour ago | hide | 106 comments
+			9. 	
+				Rocky Mountain Basic (wikipedia.org)
+				12 points by mattowen_uk 1 hour ago | hide | 5 comments
+			10. 	
+				Teller Reveals His Secrets (2012) (smithsonianmag.com)
+				56 points by Tomte 4 hours ago | hide | 26 comments
+			11. 	
+				Heroku Is Currently Down (heroku.com)
+				129 points by iamricks 2 hours ago | hide | 29 comments
+			12. 		Convictional (YC W19) is hiring engineers to build the future of B2B trade-Remote (ashbyhq.com)
+				2 hours ago | hide
+			13. 	
+				Scientists find preserved dinosaur embryo preparing to hatch like a bird (theguardian.com)
+				187 points by Petiver 9 hours ago | hide | 111 comments
+			14. 	
+				I did a Mixergy interview so bad they didn't even release it (robfitz.com)
+				15 points by robfitz 1 hour ago | hide | 7 comments
+			15. 	
+				Now DuckDuckGo is building its own desktop browser (zdnet.com)
+				132 points by waldekm 2 hours ago | hide | 64 comments
+			16. 	
+				English has been my pain for 15 years (2013) (antirez.com)
+				105 points by Tomte 1 hour ago | hide | 169 comments
+			17. 	
+				Polish opposition duo hacked with NSO spyware (apnews.com)
+				102 points by JumpCrisscross 2 hours ago | hide | 35 comments
+			18. 	
+				Linux Has Grown into a Viable PC Gaming Platform and the Steam Stats Prove It (hothardware.com)
+				119 points by rbanffy 3 hours ago | hide | 105 comments
+			19. 	
+				LG’s new 16:18 monitor (theverge.com)
+				50 points by tosh 1 hour ago | hide | 25 comments
+			20. 	
+				Construction of radio equipment in a Japanese PoW camp (bournemouth.ac.uk)
+				117 points by marcodiego 9 hours ago | hide | 16 comments
+			21. 	
+				Everything I've seen on optimizing Postgres on ZFS (vadosware.io)
+				27 points by EntICOnc 4 hours ago | hide | 2 comments
+			22. 	
+				Microsoft Teams: 1 feature, 4 vulnerabilities (positive.security)
+				269 points by kerm1t 4 hours ago | hide | 196 comments
+			23. 	
+				Analog computers were the most powerful computers for thousands of years [video] (youtube.com)
+				103 points by jdkee 9 hours ago | hide | 55 comments
+			24. 	
+				Shipwrecks, Stolen Jewels, Skull-Blasting Are Some of This Year’s Best Mysteries (atlasobscura.com)
+				8 points by CapitalistCartr 1 hour ago | hide | 1 comment
+			25. 	
+				Isolating Xwayland in a VM (roscidus.com)
+				94 points by pmarin 9 hours ago | hide | 32 comments
+			26. 	
+				Show HN: Metaheads, a search engine for Facebook comments (metaheads.xyz)
+				4 points by jawerty 1 hour ago | hide | 15 comments
+			27. 	
+				Quantum theory based on real numbers can be experimentally falsified (nature.com)
+				159 points by SquibblesRedux 14 hours ago | hide | 93 comments
+			28. 	
+				Founder of Black Girls Code has been ousted as head of the nonprofit (businessinsider.com)
+				29 points by healsdata 1 hour ago | hide | 7 comments
+			29. 	
+				Waffle House Poet Laureate (2019) (atlantamagazine.com)
+				5 points by brudgers 1 hour ago | hide | 4 comments
+			30. 	
+				Earth’s magnetic field illuminates Biblical history (economist.com)
+				46 points by helsinkiandrew 8 hours ago | hide | 17 comments
+				More
+    `
+  ), null, 2));
+}
diff --git a/todo b/todo
index 020da9e..9cbda53 100644
--- a/todo
+++ b/todo
@@ -1,4 +1,5 @@
 - switch to a new query term highlighter in search result / title and url. Nat works OK for short documents (title, url), but fucks up sometimes for long ones.
+DONE - made my own! ;p ;) xx;p
 - Create instant search (or at least instant queries (so search over previous queries -- not results necessarily))
 - an error in Full text search can corrupt the index and make it unrecoverable...we need to guard against this
 - Improve search page look
diff --git a/x b/x
new file mode 100644
index 0000000..f35c37e
--- /dev/null
+++ b/x
@@ -0,0 +1,702 @@
+[
+  {
+    "score": 0.2,
+    "fragment": "\n\t\t\t\tHacker News new | p"
+  },
+  {
+    "score": 0.2,
+    "fragment": "ast | comments | ask | s"
+  },
+  {
+    "score": 0.2,
+    "fragment": "how | jobs | submit \tlog"
+  },
+  {
+    "score": 0.2,
+    "fragment": "in\n\t\t\t1. \t\n\t\t\t\tAWS appea"
+  },
+  {
+    "score": 0.2,
+    "fragment": "rs to be down again\n\t\t\t\t"
+  },
+  {
+    "score": 0.2,
+    "fragment": "417 points by riknox 2 h"
+  },
+  {
+    "score": 0.2,
+    "fragment": "ours ago | hide | 260 co"
+  },
+  {
+    "score": 0.2,
+    "fragment": "mments\n\t\t\t2. 
\t\n\t\t\t\tFreeB" + }, + { + "score": 0.2, + "fragment": "SD Jails for Fun and Pro" + }, + { + "score": 0.2, + "fragment": "fit (topikettunen.com)\n\t" + }, + { + "score": 0.2, + "fragment": "\t\t\t42 points by kettunen" + }, + { + "score": 0.2, + "fragment": " 1 hour ago | hide | dis" + }, + { + "score": 0.2, + "fragment": "cuss\n\t\t\t3. \t\n\t\t\t\tIMF, 10" + }, + { + "score": 0.2, + "fragment": " countries simulate cybe" + }, + { + "score": 0.2, + "fragment": "r attack on global finan" + }, + { + "score": 0.2, + "fragment": "cial system (nasdaq.com)" + }, + { + "score": 0.2, + "fragment": "\n\t\t\t\t33 points by puebli" + }, + { + "score": 0.2, + "fragment": "to 1 hour ago | hide | 1" + }, + { + "score": 0.2, + "fragment": "8 comments\n\t\t\t4. \t\n\t\t\t\tD" + }, + { + "score": 0.2, + "fragment": "NA seen through the eyes" + }, + { + "score": 0.2, + "fragment": " of a coder (berthub.eu)" + }, + { + "score": 0.2, + "fragment": "\n\t\t\t\t116 points by dunef" + }, + { + "score": 0.2, + "fragment": "ox 3 hours ago | hide | " + }, + { + "score": 0.2, + "fragment": "37 comments\n\t\t\t5. \t\n\t\t\t\t" + }, + { + "score": 0.2, + "fragment": "Pure Bash lightweight we" + }, + { + "score": 0.2, + "fragment": "b server (github.com/rem" + }, + { + "score": 0.2, + "fragment": "ileduc)\n\t\t\t\t74 points by" + }, + { + "score": 0.2, + "fragment": " turrini 2 hours ago | h" + }, + { + "score": 0.2, + "fragment": "ide | 46 comments\n\t\t\t6. " + }, + { + "score": 0.2, + "fragment": "\t\n\t\t\t\tParser Combinators" + }, + { + "score": 0.2, + "fragment": " in Haskell (serokell.io" + }, + { + "score": 0.2, + "fragment": ")\n\t\t\t\t18 points by arocc" + }, + { + "score": 0.2, + "fragment": "oli 1 hour ago | hide | " + }, + { + "score": 0.2, + "fragment": "3 comments\n\t\t\t7. 
\t\n\t\t\t\tD" + }, + { + "score": 0.2, + "fragment": "eepMind’s New AI with a " + }, + { + "score": 0.2, + "fragment": "Memory Outperforms Algor" + }, + { + "score": 0.2, + "fragment": "ithms 25 Times Its Size " + }, + { + "score": 0.2, + "fragment": "(singularityhub.com)\n\t\t\t" + }, + { + "score": 0.2, + "fragment": "\t233 points by darkscape" + }, + { + "score": 0.2, + "fragment": " 9 hours ago | hide | 88" + }, + { + "score": 0.2, + "fragment": " comments\n\t\t\t8. \t\n\t\t\t\tTi" + }, + { + "score": 0.2, + "fragment": "nder just permabanned me" + }, + { + "score": 0.2, + "fragment": " or the problem with big" + }, + { + "score": 0.2, + "fragment": " tech (paulefou.com)\n\t\t\t" + }, + { + "score": 0.2, + "fragment": "\t90 points by svalee 1 h" + }, + { + "score": 0.2, + "fragment": "our ago | hide | 106 com" + }, + { + "score": 0.2, + "fragment": "ments\n\t\t\t9. \t\n\t\t\t\tRocky " + }, + { + "score": 0.2, + "fragment": "Mountain Basic (wikipedi" + }, + { + "score": 0.2, + "fragment": "a.org)\n\t\t\t\t12 points by " + }, + { + "score": 0.2, + "fragment": "mattowen_uk 1 hour ago |" + }, + { + "score": 0.2, + "fragment": " hide | 5 comments\n\t\t\t10" + }, + { + "score": 0.2, + "fragment": ". \t\n\t\t\t\tTeller Reveals H" + }, + { + "score": 0.2, + "fragment": "is Secrets (2012) (smith" + }, + { + "score": 0.2, + "fragment": "sonianmag.com)\n\t\t\t\t56 po" + }, + { + "score": 0.2, + "fragment": "ints by Tomte 4 hours ag" + }, + { + "score": 0.2, + "fragment": "o | hide | 26 comments\n\t" + }, + { + "score": 0.2, + "fragment": "\t\t11. \t\n\t\t\t\tHeroku Is Cu" + }, + { + "score": 0.2, + "fragment": "rrently Down (heroku.com" + }, + { + "score": 0.2, + "fragment": ")\n\t\t\t\t129 points by iamr" + }, + { + "score": 0.2, + "fragment": "icks 2 hours ago | hide " + }, + { + "score": 0.2, + "fragment": "| 29 comments\n\t\t\t12. 
\t\tC" + }, + { + "score": 0.2, + "fragment": "onvictional (YC W19) is " + }, + { + "score": 0.2, + "fragment": "hiring engineers to buil" + }, + { + "score": 0.2, + "fragment": "d the future of B2B trad" + }, + { + "score": 0.2, + "fragment": "e-Remote (ashbyhq.com)\n\t" + }, + { + "score": 0.2, + "fragment": "\t\t\t2 hours ago | hide\n\t\t" + }, + { + "score": 0.2, + "fragment": "\t13. \t\n\t\t\t\tScientists fi" + }, + { + "score": 0.2, + "fragment": "nd preserved dinosaur em" + }, + { + "score": 0.2, + "fragment": "bryo preparing to hatch " + }, + { + "score": 0.2, + "fragment": "like a bird (theguardian" + }, + { + "score": 0.2, + "fragment": ".com)\n\t\t\t\t187 points by " + }, + { + "score": 0.2, + "fragment": "Petiver 9 hours ago | hi" + }, + { + "score": 0.2, + "fragment": "de | 111 comments\n\t\t\t14." + }, + { + "score": 0.2, + "fragment": " \t\n\t\t\t\tI did a Mixergy i" + }, + { + "score": 0.2, + "fragment": "nterview so bad they did" + }, + { + "score": 0.2, + "fragment": "n't even release it (rob" + }, + { + "score": 0.2, + "fragment": "fitz.com)\n\t\t\t\t15 points " + }, + { + "score": 0.2, + "fragment": "by robfitz 1 hour ago | " + }, + { + "score": 0.2, + "fragment": "hide | 7 comments\n\t\t\t15." + }, + { + "score": 0.2, + "fragment": " \t\n\t\t\t\tNow DuckDuckGo is" + }, + { + "score": 0.2, + "fragment": " building its own deskto" + }, + { + "score": 0.2, + "fragment": "p browser (zdnet.com)\n\t\t" + }, + { + "score": 0.2, + "fragment": "\t\t132 points by waldekm " + }, + { + "score": 0.2, + "fragment": "2 hours ago | hide | 64 " + }, + { + "score": 0.2, + "fragment": "comments\n\t\t\t16. \t\n\t\t\t\tEn" + }, + { + "score": 0.2, + "fragment": "glish has been my pain f" + }, + { + "score": 0.2, + "fragment": "or 15 years (2013) (anti" + }, + { + "score": 0.2, + "fragment": "rez.com)\n\t\t\t\t105 points " + }, + { + "score": 0.2, + "fragment": "by Tomte 1 hour ago | hi" + }, + { + "score": 0.2, + "fragment": "de | 169 comments\n\t\t\t17." 
+ }, + { + "score": 0.2, + "fragment": " \t\n\t\t\t\tPolish opposition" + }, + { + "score": 0.2, + "fragment": " duo hacked with NSO spy" + }, + { + "score": 0.2, + "fragment": "ware (apnews.com)\n\t\t\t\t10" + }, + { + "score": 0.2, + "fragment": "2 points by JumpCrisscro" + }, + { + "score": 0.2, + "fragment": "ss 2 hours ago | hide | " + }, + { + "score": 0.2, + "fragment": "35 comments\n\t\t\t18. \t\n\t\t\t" + }, + { + "score": 0.2, + "fragment": "\tLinux Has Grown into a " + }, + { + "score": 0.2, + "fragment": "Viable PC Gaming Platfor" + }, + { + "score": 0.2, + "fragment": "m and the Steam Stats Pr" + }, + { + "score": 0.2, + "fragment": "ove It (hothardware.com)" + }, + { + "score": 0.2, + "fragment": "\n\t\t\t\t119 points by rbanf" + }, + { + "score": 0.2, + "fragment": "fy 3 hours ago | hide | " + }, + { + "score": 0.2, + "fragment": "105 comments\n\t\t\t19. \t\n\t\t" + }, + { + "score": 0.2, + "fragment": "\t\tLG’s new 16:18 monitor" + }, + { + "score": 0.2, + "fragment": " (theverge.com)\n\t\t\t\t50 p" + }, + { + "score": 0.2, + "fragment": "oints by tosh 1 hour ago" + }, + { + "score": 0.2, + "fragment": " | hide | 25 comments\n\t\t" + }, + { + "score": 0.2, + "fragment": "\t20. \t\n\t\t\t\tConstruction " + }, + { + "score": 0.2, + "fragment": "of radio equipment in a " + }, + { + "score": 0.2, + "fragment": "Japanese PoW camp (bourn" + }, + { + "score": 0.2, + "fragment": "emouth.ac.uk)\n\t\t\t\t117 po" + }, + { + "score": 0.2, + "fragment": "ints by marcodiego 9 hou" + }, + { + "score": 0.2, + "fragment": "rs ago | hide | 16 comme" + }, + { + "score": 0.2, + "fragment": "nts\n\t\t\t21. 
\t\n\t\t\t\tEveryth" + }, + { + "score": 0.2, + "fragment": "ing I've seen on optimiz" + }, + { + "score": 0.2, + "fragment": "ing Postgres on ZFS (vad" + }, + { + "score": 0.2, + "fragment": "osware.io)\n\t\t\t\t27 points" + }, + { + "score": 0.2, + "fragment": " by EntICOnc 4 hours ago" + }, + { + "score": 0.2, + "fragment": " | hide | 2 comments\n\t\t\t" + }, + { + "score": 0.2, + "fragment": "22. \t\n\t\t\t\tMicrosoft Team" + }, + { + "score": 0.2, + "fragment": "s: 1 feature, 4 vulnerab" + }, + { + "score": 0.2, + "fragment": "ilities (positive.securi" + }, + { + "score": 0.2, + "fragment": "ty)\n\t\t\t\t269 points by ke" + }, + { + "score": 0.2, + "fragment": "rm1t 4 hours ago | hide " + }, + { + "score": 0.2, + "fragment": "| 196 comments\n\t\t\t23. \t\n" + }, + { + "score": 0.2, + "fragment": "\t\t\t\tAnalog computers wer" + }, + { + "score": 0.2, + "fragment": "e the most powerful comp" + }, + { + "score": 0.2, + "fragment": "uters for thousands of y" + }, + { + "score": 0.2, + "fragment": "ears [video] (youtube.co" + }, + { + "score": 0.2, + "fragment": "m)\n\t\t\t\t103 points by jdk" + }, + { + "score": 0.2, + "fragment": "ee 9 hours ago | hide | " + }, + { + "score": 0.2, + "fragment": "55 comments\n\t\t\t24. \t\n\t\t\t" + }, + { + "score": 0.2, + "fragment": "\tShipwrecks, Stolen Jewe" + }, + { + "score": 0.2, + "fragment": "ls, Skull-Blasting Are S" + }, + { + "score": 0.2, + "fragment": "ome of This Year’s Best " + }, + { + "score": 0.2, + "fragment": "Mysteries (atlasobscura." + }, + { + "score": 0.2, + "fragment": "com)\n\t\t\t\t8 points by Cap" + }, + { + "score": 0.2, + "fragment": "italistCartr 1 hour ago " + }, + { + "score": 0.2, + "fragment": "| hide | 1 comment\n\t\t\t25" + }, + { + "score": 0.2, + "fragment": ". 
\t\n\t\t\t\tIsolating Xwayla" + }, + { + "score": 0.2, + "fragment": "nd in a VM (roscidus.com" + }, + { + "score": 0.2, + "fragment": ")\n\t\t\t\t94 points by pmari" + }, + { + "score": 0.2, + "fragment": "n 9 hours ago | hide | 3" + }, + { + "score": 0.2, + "fragment": "2 comments\n\t\t\t26. \t\n\t\t\t\t" + }, + { + "score": 0.2, + "fragment": "Show HN: Metaheads, a se" + }, + { + "score": 0.2, + "fragment": "arch engine for Facebook" + }, + { + "score": 0.2, + "fragment": " comments (metaheads.xyz" + }, + { + "score": 0.2, + "fragment": ")\n\t\t\t\t4 points by jawert" + }, + { + "score": 0.2, + "fragment": "y 1 hour ago | hide | 15" + }, + { + "score": 0.2, + "fragment": " comments\n\t\t\t27. \t\n\t\t\t\tQ" + }, + { + "score": 0.2, + "fragment": "uantum theory based on r" + }, + { + "score": 0.2, + "fragment": "eal numbers can be exper" + }, + { + "score": 0.2, + "fragment": "imentally falsified (nat" + }, + { + "score": 0.2, + "fragment": "ure.com)\n\t\t\t\t159 points " + }, + { + "score": 0.2, + "fragment": "by SquibblesRedux 14 hou" + }, + { + "score": 0.2, + "fragment": "rs ago | hide | 93 comme" + }, + { + "score": 0.2, + "fragment": "nts\n\t\t\t28. \t\n\t\t\t\tFounder" + }, + { + "score": 0.2, + "fragment": " of Black Girls Code has" + }, + { + "score": 0.2, + "fragment": " been ousted as head of " + }, + { + "score": 0.2, + "fragment": "the nonprofit (businessi" + }, + { + "score": 0.2, + "fragment": "nsider.com)\n\t\t\t\t29 point" + }, + { + "score": 0.2, + "fragment": "s by healsdata 1 hour ag" + }, + { + "score": 0.2, + "fragment": "o | hide | 7 comments\n\t\t" + }, + { + "score": 0.2, + "fragment": "\t29. \t\n\t\t\t\tWaffle House " + }, + { + "score": 0.2, + "fragment": "Poet Laureate (2019) (at" + }, + { + "score": 0.2, + "fragment": "lantamagazine.com)\n\t\t\t\t5" + }, + { + "score": 0.2, + "fragment": " points by brudgers 1 ho" + }, + { + "score": 0.2, + "fragment": "ur ago | hide | 4 commen" + }, + { + "score": 0.2, + "fragment": "ts\n\t\t\t30. 
\t\n\t\t\t\tEarth’s " + }, + { + "score": 0.2, + "fragment": "magnetic field illuminat" + }, + { + "score": 0.2, + "fragment": "es Biblical history (eco" + }, + { + "score": 0.2, + "fragment": "nomist.com)\n\t\t\t\t46 point" + }, + { + "score": 0.2, + "fragment": "s by helsinkiandrew 8 ho" + }, + { + "score": 0.2, + "fragment": "urs ago | hide | 17 comm" + }, + { + "score": 0.2, + "fragment": "ents\n\t\t\t\tMore\n " + } +]