"Adding ndx stuff"

This commit is contained in:
Cris Stringfellow 2021-12-17 14:42:51 +08:00
parent ac45a389b8
commit e8a15d4eed
3 changed files with 957 additions and 720 deletions

View File

@ -1,36 +1,58 @@
import hasha from 'hasha';
import {URL} from 'url';
import Path from 'path';
import fs from 'fs';
import FlexSearch from 'flexsearch';
import args from './args.js';
import {
// module imports
import hasha from 'hasha';
import {URL} from 'url';
import Path from 'path';
import fs from 'fs';
// search related
import FlexSearch from 'flexsearch';
import { createIndex as NDX, addDocumentToIndex as ndx } from 'ndx';
import { query as NDXQuery } from 'ndx-query';
import Nat from 'natural';
import args from './args.js';
import {
APP_ROOT, context, sleep, DEBUG,
clone,
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
} from './common.js';
import {connect} from './protocol.js';
import {getInjection} from './public/injection.js';
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
} from './common.js';
import {connect} from './protocol.js';
import {getInjection} from './public/injection.js';
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
//import xapian from 'xapian';
// cache is a simple map
// that holds the serialized requests
// that are saved on disk
let Fs, Mode, Close;
const {Index: FTSIndex, registerCharset, registerLanguage} = FlexSearch;
const FLEX_OPTS = {
// search related state: constants and variables
// common
const FTS_INDEX_DIR = args.fts_index_dir;
// FlexSearch
const {Index: FTSIndex, registerCharset, registerLanguage} = FlexSearch;
const FLEX_OPTS = {
context: true,
language: "en"
};
const Targets = new Map();
const UpdatedKeys = new Set();
const Flex = new FTSIndex(FLEX_OPTS);
const Cache = new Map();
const Index = new Map();
const Indexing = new Set();
const State = {
};
const Flex = new FTSIndex(FLEX_OPTS);
// natural (NLP tools -- stemmers and tokenizers, etc)
const Tokenizer = new Nat.WordTokenizer();
const StemmerEn = Nat.PorterStemmer;
// NDX
const NDX_FIELDS = ndxDocFields();
const words = Tokenizer.tokenize.bind(Tokenizer);
const termFilter = StemmerEn.stem.bind(StemmerEn);
// module state: constants and variables
// cache is a simple map
// that holds the serialized requests
// that are saved on disk
let Fs, Mode, Close;
const Targets = new Map();
const UpdatedKeys = new Set();
const Cache = new Map();
const Index = new Map();
const Indexing = new Set();
const State = {
Indexing,
Cache,
Index,
@ -42,53 +64,53 @@ const State = {
ftsIndexSaver: null,
saveInProgress: false,
ftsSaveInProgress: false
}
}
const IGNORE_NODES = new Set([
const IGNORE_NODES = new Set([
'script',
'style',
'noscript',
'datalist'
]);
const TextNode = 3;
const AttributeNode = 2;
]);
const TextNode = 3;
const AttributeNode = 2;
const Archivist = {
const Archivist = {
collect, getMode, changeMode, shutdown, handlePathChanged, saveIndex,
search,
getTitle
}
}
const BODYLESS = new Set([
const BODYLESS = new Set([
301,
302,
303,
307
]);
const NEVER_CACHE = new Set([
]);
const NEVER_CACHE = new Set([
`http://localhost:${args.server_port}`,
`http://localhost:${args.chrome_port}`
]);
const SORT_URLS = ([urlA],[urlB]) => urlA < urlB ? -1 : 1;
const CACHE_FILE = args.cache_file;
const INDEX_FILE = args.index_file;
const FTS_INDEX_DIR = args.fts_index_dir;
const NO_FILE = args.no_file;
const TBL = /:\/\//g;
const HASH_OPTS = {algorithm: 'sha1'};
const UNCACHED_BODY = b64('We have not saved this data');
const UNCACHED_CODE = 404;
const UNCACHED_HEADERS = [
]);
const SORT_URLS = ([urlA],[urlB]) => urlA < urlB ? -1 : 1;
const CACHE_FILE = args.cache_file;
const INDEX_FILE = args.index_file;
const NO_FILE = args.no_file;
const TBL = /:\/\//g;
const HASH_OPTS = {algorithm: 'sha1'};
const UNCACHED_BODY = b64('We have not saved this data');
const UNCACHED_CODE = 404;
const UNCACHED_HEADERS = [
{ name: 'Content-type', value: 'text/plain' },
{ name: 'Content-length', value: '26' }
];
const UNCACHED = {
];
const UNCACHED = {
body:UNCACHED_BODY, responseCode:UNCACHED_CODE, responseHeaders:UNCACHED_HEADERS
}
}
export default Archivist;
async function collect({chrome_port:port, mode} = {}) {
// main
async function collect({chrome_port:port, mode} = {}) {
if ( context == 'node' ) {
const {default:fs} = await import('fs');
Fs = fs;
@ -565,9 +587,10 @@ async function collect({chrome_port:port, mode} = {}) {
return `${method}${url}`;
//return `${url}${urlFragment}:${method}:${sortedHeaders}:${postData}:${hasPostData}`;
}
}
}
function clearSavers() {
// helpers
function clearSavers() {
if ( State.saver ) {
clearInterval(State.saver);
State.saver = null;
@ -582,9 +605,9 @@ function clearSavers() {
clearTimeout(State.ftsIndexSaver);
State.ftsIndexSaver = null;
}
}
}
function loadFiles() {
function loadFiles() {
try {
const cacheFile = CACHE_FILE();
const indexFile = INDEX_FILE();
@ -630,11 +653,11 @@ function loadFiles() {
DEBUG && console.warn('Error compiling regex from No file', e);
State.No = null;
}
}
}
function getMode() { return Mode; }
function getMode() { return Mode; }
async function changeMode(mode) {
async function changeMode(mode) {
DEBUG && console.log({modeChange:mode});
clearSavers();
saveCache();
@ -643,13 +666,13 @@ async function changeMode(mode) {
Close && Close();
Mode = mode;
await collect({chrome_port:args.chrome_port, mode});
}
}
function getTitle(url) {
function getTitle(url) {
return State.Index.get(url);
}
}
function handlePathChanged() {
function handlePathChanged() {
DEBUG && console.log({libraryPathChange:args.library_path()});
clearSavers();
// saves the old cache path
@ -658,16 +681,16 @@ function handlePathChanged() {
saveFTS(State.SavedFTSIndexDirPath);
// reloads from new path and updates Saved FilePaths
loadFiles();
}
}
function saveCache(path) {
function saveCache(path) {
if ( context == 'node' ) {
//DEBUG && console.log("Writing to", path || CACHE_FILE());
Fs.writeFileSync(path || CACHE_FILE(), JSON.stringify([...State.Cache.entries()],null,2));
}
}
}
function saveIndex(path) {
function saveIndex(path) {
if ( State.saveInProgress ) return;
State.saveInProgress = true;
@ -685,13 +708,13 @@ function saveIndex(path) {
State.indexSaver = setTimeout(saveIndex, 11001);
State.saveInProgress = false;
}
}
async function search(query) {
async function search(query) {
return await Flex.searchAsync(query, args.results_per_page);
}
}
async function saveFTS(path) {
async function saveFTS(path) {
if ( State.ftsSaveInProgress ) return;
State.ftsSaveInProgress = true;
@ -722,23 +745,82 @@ async function saveFTS(path) {
State.ftsIndexSaver = setTimeout(saveFTS, 31001);
State.ftsSaveInProgress = false;
}
}
function shutdown() {
function shutdown() {
DEBUG && console.log(`Archivist shutting down...`);
saveCache();
Close && Close();
DEBUG && console.log(`Archivist shut down.`);
}
}
function b64(s) {
function b64(s) {
if ( context == 'node' ) {
return Buffer.from(s).toString('base64');
} else {
return btoa(s);
}
}
}
/**
 * Builds an in-memory NDX full-text index over the given document fields.
 *
 * Must be invoked with `new`. Note that the constructor returns a plain
 * object exposing `add` and `search`, so that object (not `this`) is what
 * the caller receives.
 *
 * @param {Array<{name: string}>} fields - one entry per indexed field;
 *   `name` is the property read from each document for that field.
 * @returns {{add: function(Object): *, search: function(string): *}}
 *   `add(doc)` indexes a document keyed by `doc.url`; `search(q)` runs a
 *   query and returns whatever `NDXQuery` returns.
 * @throws {TypeError} when called without `new`.
 */
function NDXIndex(fields) {
  if ( ! new.target ) {
    // throw a real Error (not a bare string) so the failure carries a stack trace
    throw new TypeError(`NDXIndex must be called with 'new'`);
  }
  // `createIndex()` (imported as NDX) creates an index data structure.
  // First argument specifies how many different fields we want to index.
  const index = NDX(fields.length);
  // `fieldAccessors` is an array of functions used to retrieve data from the different fields.
  const fieldAccessors = fields.map(f => doc => doc[f.name]);
  // `fieldBoostFactors` is an array of boost factors, one per field; here all fields
  // get the identical factor 1.
  const fieldBoostFactors = fields.map(() => 1);
  return {
    // `add()` function will add documents to the index.
    add: doc => ndx(
      index,
      fieldAccessors,
      // Tokenizer is a function that breaks text into words, phrases, symbols, or other
      // meaningful elements called tokens. `words` is the module-level tokenizer.
      words,
      // Filter is a function that processes tokens and returns terms; terms are used in
      // the inverted index to index documents. `termFilter` is the module-level stemmer.
      termFilter,
      // Document key: it can be a unique document id or a reference to a document if you
      // want to store all documents in memory. Here the document's url is the key.
      doc.url,
      // Document.
      doc,
    ),
    // `search()` function will be used to perform queries.
    search: q => NDXQuery(
      index,
      fieldBoostFactors,
      // BM25 ranking function constants:
      1.2,  // BM25 k1 constant, controls non-linear term frequency normalization (saturation).
      0.75, // BM25 b constant, controls to what degree document length normalizes tf values.
      // the same tokenizer and term filter used by `add()` are passed for the query
      words,
      termFilter,
      // Set of removed documents: removing documents from the index is not supported here,
      // so it is passed as `undefined`.
      undefined,
      q,
    ),
  };
}
/**
 * Projects a page record onto the document shape that gets indexed:
 * `url` and `title` are carried over, `pageText` is exposed as `content`.
 * Any other properties on the input are dropped.
 */
function toNDXDoc({url, title, pageText: content}) {
  const ndxDoc = {url, title, content};
  return ndxDoc;
}
/**
 * Returns a fresh list of field descriptors (`{name}`) for the indexed
 * document shape, in the order: url, title, content.
 */
function ndxDocFields() {
  const fieldNames = ["url", "title", "content"];
  return fieldNames.map(name => ({ name }));
}

151
package-lock.json generated
View File

@ -15,6 +15,10 @@
"express": "latest",
"flexsearch": "latest",
"hasha": "latest",
"natural": "^5.1.11",
"ndx": "^1.0.2",
"ndx-query": "^1.0.1",
"ndx-serializable": "^1.0.0",
"node-fetch": "latest",
"ws": "latest"
},
@ -448,6 +452,15 @@
"node": ">=0.4.0"
}
},
"node_modules/afinn-165": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/afinn-165/-/afinn-165-1.0.4.tgz",
"integrity": "sha512-7+Wlx3BImrK0HiG6y3lU4xX7SpBPSSu8T9iguPMlaueRFxjbYwAQrp9lqZUuFikqKbd/en8lVREILvP2J80uJA==",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/ajv": {
"version": "6.12.6",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
@ -528,6 +541,17 @@
"node": ">= 8"
}
},
"node_modules/apparatus": {
"version": "0.0.10",
"resolved": "https://registry.npmjs.org/apparatus/-/apparatus-0.0.10.tgz",
"integrity": "sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==",
"dependencies": {
"sylvester": ">= 0.0.8"
},
"engines": {
"node": ">=0.2.6"
}
},
"node_modules/argparse": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
@ -3931,6 +3955,22 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"node_modules/natural": {
"version": "5.1.11",
"resolved": "https://registry.npmjs.org/natural/-/natural-5.1.11.tgz",
"integrity": "sha512-ecITGKjUNXxj6+g0oD0nEzYgmUxK3WRbye3zy1OUmRhgWx04BEQGILc5LnqpdYpKsA+D3CDDJMImyX70Li8dyw==",
"dependencies": {
"afinn-165": "^1.0.2",
"apparatus": "^0.0.10",
"safe-stable-stringify": "^2.2.0",
"sylvester": "^0.0.12",
"underscore": "^1.9.1",
"wordnet-db": "^3.1.11"
},
"engines": {
"node": ">=0.4.10"
}
},
"node_modules/natural-compare": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
@ -3946,6 +3986,24 @@
"ncp": "bin/ncp"
}
},
"node_modules/ndx": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/ndx/-/ndx-1.0.2.tgz",
"integrity": "sha512-/TbqqemJ80lGKRoRuXsz7VgA0erkIxilCUbkMfRL1h2VBGBLGvQnI+FdHvWDqJnUhgOP/T9+SYeWS84wbXGBFA=="
},
"node_modules/ndx-query": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/ndx-query/-/ndx-query-1.0.1.tgz",
"integrity": "sha512-ybm/bt2WDwDzoUDXKrqW+oHKPV9qF9E8ICqZUWZDYgPvogMZ49eaXnCJ1jP9V+bkgR98EebS7ylE1DIjwqvl4g==",
"dependencies": {
"ndx": "^1.0.2"
}
},
"node_modules/ndx-serializable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/ndx-serializable/-/ndx-serializable-1.0.0.tgz",
"integrity": "sha512-CViD3O8GRcWrQ2IPubwGnlmuxB81kEihjLH6SZLxUCxxL9pM6IH7RZah0SmrTuUCNx4kjiaM2S49ReaA5wiNtA=="
},
"node_modules/negotiator": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
@ -6421,6 +6479,14 @@
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"node_modules/safe-stable-stringify": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.3.1.tgz",
"integrity": "sha512-kYBSfT+troD9cDA85VDnHZ1rpHC50O0g1e6WlGHVCz/g+JS+9WKLj+XwFYyR8UbrZN8ll9HUpDAAddY58MGisg==",
"engines": {
"node": ">=10"
}
},
"node_modules/safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
@ -6870,6 +6936,14 @@
"node": ">=4"
}
},
"node_modules/sylvester": {
"version": "0.0.12",
"resolved": "https://registry.npmjs.org/sylvester/-/sylvester-0.0.12.tgz",
"integrity": "sha1-WohEFc0tACxX56OqyZRip1zp/bQ=",
"engines": {
"node": ">=0.2.6"
}
},
"node_modules/syntax-error": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/syntax-error/-/syntax-error-1.4.0.tgz",
@ -7176,6 +7250,11 @@
"integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==",
"dev": true
},
"node_modules/underscore": {
"version": "1.13.2",
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.2.tgz",
"integrity": "sha512-ekY1NhRzq0B08g4bGuX4wd2jZx5GnKz6mKSqFL4nqBlfyMGiG10gDFhDTMEfYmDL6Jy0FUIZp7wiRB+0BP7J2g=="
},
"node_modules/unique-string": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/unique-string/-/unique-string-2.0.0.tgz",
@ -7564,6 +7643,14 @@
"node": ">=0.10.0"
}
},
"node_modules/wordnet-db": {
"version": "3.1.14",
"resolved": "https://registry.npmjs.org/wordnet-db/-/wordnet-db-3.1.14.tgz",
"integrity": "sha512-zVyFsvE+mq9MCmwXUWHIcpfbrHHClZWZiVOzKSxNJruIcFn2RbY55zkhiAMMxM8zCVSmtNiViq8FsAZSFpMYag==",
"engines": {
"node": ">=0.6.0"
}
},
"node_modules/wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
@ -8097,6 +8184,11 @@
"integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA==",
"dev": true
},
"afinn-165": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/afinn-165/-/afinn-165-1.0.4.tgz",
"integrity": "sha512-7+Wlx3BImrK0HiG6y3lU4xX7SpBPSSu8T9iguPMlaueRFxjbYwAQrp9lqZUuFikqKbd/en8lVREILvP2J80uJA=="
},
"ajv": {
"version": "6.12.6",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
@ -8156,6 +8248,14 @@
"picomatch": "^2.0.4"
}
},
"apparatus": {
"version": "0.0.10",
"resolved": "https://registry.npmjs.org/apparatus/-/apparatus-0.0.10.tgz",
"integrity": "sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==",
"requires": {
"sylvester": ">= 0.0.8"
}
},
"argparse": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
@ -10881,6 +10981,19 @@
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"natural": {
"version": "5.1.11",
"resolved": "https://registry.npmjs.org/natural/-/natural-5.1.11.tgz",
"integrity": "sha512-ecITGKjUNXxj6+g0oD0nEzYgmUxK3WRbye3zy1OUmRhgWx04BEQGILc5LnqpdYpKsA+D3CDDJMImyX70Li8dyw==",
"requires": {
"afinn-165": "^1.0.2",
"apparatus": "^0.0.10",
"safe-stable-stringify": "^2.2.0",
"sylvester": "^0.0.12",
"underscore": "^1.9.1",
"wordnet-db": "^3.1.11"
}
},
"natural-compare": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
@ -10893,6 +11006,24 @@
"integrity": "sha1-GVoh1sRuNh0vsSgbo4uR6d9727M=",
"dev": true
},
"ndx": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/ndx/-/ndx-1.0.2.tgz",
"integrity": "sha512-/TbqqemJ80lGKRoRuXsz7VgA0erkIxilCUbkMfRL1h2VBGBLGvQnI+FdHvWDqJnUhgOP/T9+SYeWS84wbXGBFA=="
},
"ndx-query": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/ndx-query/-/ndx-query-1.0.1.tgz",
"integrity": "sha512-ybm/bt2WDwDzoUDXKrqW+oHKPV9qF9E8ICqZUWZDYgPvogMZ49eaXnCJ1jP9V+bkgR98EebS7ylE1DIjwqvl4g==",
"requires": {
"ndx": "^1.0.2"
}
},
"ndx-serializable": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/ndx-serializable/-/ndx-serializable-1.0.0.tgz",
"integrity": "sha512-CViD3O8GRcWrQ2IPubwGnlmuxB81kEihjLH6SZLxUCxxL9pM6IH7RZah0SmrTuUCNx4kjiaM2S49ReaA5wiNtA=="
},
"negotiator": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
@ -12755,6 +12886,11 @@
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"safe-stable-stringify": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.3.1.tgz",
"integrity": "sha512-kYBSfT+troD9cDA85VDnHZ1rpHC50O0g1e6WlGHVCz/g+JS+9WKLj+XwFYyR8UbrZN8ll9HUpDAAddY58MGisg=="
},
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
@ -13124,6 +13260,11 @@
"has-flag": "^3.0.0"
}
},
"sylvester": {
"version": "0.0.12",
"resolved": "https://registry.npmjs.org/sylvester/-/sylvester-0.0.12.tgz",
"integrity": "sha1-WohEFc0tACxX56OqyZRip1zp/bQ="
},
"syntax-error": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/syntax-error/-/syntax-error-1.4.0.tgz",
@ -13352,6 +13493,11 @@
"integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==",
"dev": true
},
"underscore": {
"version": "1.13.2",
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.2.tgz",
"integrity": "sha512-ekY1NhRzq0B08g4bGuX4wd2jZx5GnKz6mKSqFL4nqBlfyMGiG10gDFhDTMEfYmDL6Jy0FUIZp7wiRB+0BP7J2g=="
},
"unique-string": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/unique-string/-/unique-string-2.0.0.tgz",
@ -13643,6 +13789,11 @@
"integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
"dev": true
},
"wordnet-db": {
"version": "3.1.14",
"resolved": "https://registry.npmjs.org/wordnet-db/-/wordnet-db-3.1.14.tgz",
"integrity": "sha512-zVyFsvE+mq9MCmwXUWHIcpfbrHHClZWZiVOzKSxNJruIcFn2RbY55zkhiAMMxM8zCVSmtNiViq8FsAZSFpMYag=="
},
"wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",

View File

@ -38,6 +38,10 @@
"express": "latest",
"flexsearch": "latest",
"hasha": "latest",
"natural": "^5.1.11",
"ndx": "^1.0.2",
"ndx-query": "^1.0.1",
"ndx-serializable": "^1.0.0",
"node-fetch": "latest",
"ws": "latest"
},