Diskernet/archivist.js

1257 lines
40 KiB
JavaScript

// Licenses
// FlexSearch is Apache-2.0 licensed
// Source: https://github.com/nextapps-de/flexsearch/blob/bffb255b7904cb7f79f027faeb963ecef0a85dba/LICENSE
// NDX is MIT licensed
// Source: https://github.com/ndx-search/ndx/blob/cc9ec2780d88918338d4edcfca2d4304af9dc721/LICENSE
// module imports
import hasha from 'hasha';
import {URL} from 'url';
import Path from 'path';
import os from 'os';
import Fs from 'fs';
import { stdin as input, stdout as output } from 'process';
import util from 'util';
import readline from 'readline';
// search related
import FlexSearch from 'flexsearch';
import {
createIndex as NDX,
addDocumentToIndex as ndx,
removeDocumentFromIndex,
vacuumIndex
} from 'ndx';
import { query as NDXQuery } from 'ndx-query';
import { toSerializable, fromSerializable } from 'ndx-serializable';
//import { DocumentIndex } from 'ndx';
//import Fuzzy from 'fz-search';
import * as _Fuzzy from './lib/fz.js';
import Nat from 'natural';
//import match from 'autosuggest-highlight/match';
//import parse from 'autosuggest-highlight/parse';
import args from './args.js';
import {
APP_ROOT, context, sleep, DEBUG,
clone,
SNIP_CONTEXT,
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
} from './common.js';
import {connect} from './protocol.js';
import {getInjection} from './public/injection.js';
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
// search related state: constants and variables
// common
// The fuzzy-search constructor is read off the global object.
// NOTE(review): presumably installed as a side effect of importing './lib/fz.js' above — confirm.
const Fuzzy = globalThis.FuzzySearch;
// Selects which branch of NDXIndex() and ndxDocFields() is used.
const NDX_OLD = false;
const USE_FLEX = true;
// Called as a function (FTS_INDEX_DIR()) wherever the directory path is needed.
const FTS_INDEX_DIR = args.fts_index_dir;
// Splits a URL on '/' and '.' so its parts can be indexed as words (see getURI).
const URI_SPLIT = /[\/.]/g;
// Key in State.Index under which the NDX id counter is persisted (see saveFiles / loadFiles).
const NDX_ID_KEY = 'ndx_id';
const INDEX_HIDDEN_KEYS = new Set([
NDX_ID_KEY
]);
// True for bookkeeping keys of State.Index: anything starting with 'ndx'
// (e.g. the 'ndx'+ndx_id reverse lookups written by indexURL) or listed in INDEX_HIDDEN_KEYS.
const hiddenKey = key => key.startsWith('ndx') || INDEX_HIDDEN_KEYS.has(key);
// Document id counter; initialised in loadFiles and incremented in indexURL.
let Id;
// natural (NLP tools -- stemmers and tokenizers, etc)
const Tokenizer = new Nat.WordTokenizer();
const Stemmer = Nat.PorterStemmer;
//const Stemmer = Nat.LancasterStemmer; // EN only
// Tokenizer and term filter handed to ndx when adding documents and when querying (see NDXIndex).
const words = Tokenizer.tokenize.bind(Tokenizer);
const termFilter = Stemmer.stem.bind(Stemmer);
//const termFilter = s => s.toLocaleLowerCase();
// FlexSearch
const {Index: FTSIndex, registerCharset, registerLanguage} = FlexSearch;
const FLEX_OPTS = {
charset: "utf8",
context: true,
language: "en",
tokenize: "reverse"
};
// Reassigned in beforePathChanged when the library path changes.
let Flex = new FTSIndex(FLEX_OPTS);
DEBUG && console.log({Flex});
// NDX
// Keys removed from the NDX index but not yet vacuumed; shared by remove/search/maybeClean in NDXIndex.
const NDXRemoved = new Set();
// Once this many removals are pending, maybeClean() vacuums the index.
const REMOVED_CAP_TO_VACUUM_NDX = 10;
const NDX_FIELDS = ndxDocFields();
// Reassigned in beforePathChanged when the library path changes.
let NDX_FTSIndex = new NDXIndex(NDX_FIELDS);
// NDX document key counter; initialised in loadFiles and incremented in toNDXDoc.
let NDXId;
DEBUG && console.log({NDX_FTSIndex});
// fuzzy (maybe just for queries ?)
const FUZZ_OPTS = {
keys: ndxDocFields({namesOnly:true})
};
// url -> indexed document (see indexURL and loadFuzzy).
const Docs = new Map();
let fuzzy = new Fuzzy({source: [...Docs.values()], keys: FUZZ_OPTS.keys});
// module state: constants and variables
// cache is a simple map
// that holds the serialized requests
// that are saved on disk
// `loaded` is set to true at the end of collect(); isReady() waits on it.
const Status = {
loaded: false
};
// sessionId -> latest targetInfo for that session (see installForSession / updateTargetInfo).
const Targets = new Map();
// URLs indexed since the last full-text index save (filled by indexURL, cleared by saveFTS).
const UpdatedKeys = new Set();
// request key -> path of the saved response file; also origin -> origin directory (see saveResponseData).
const Cache = new Map();
// url -> {id, ndx_id, title}, id -> url, and 'ndx'+ndx_id -> url (see indexURL).
const Index = new Map();
// targetIds whose page is currently being indexed (see indexURL).
const Indexing = new Set();
const BLANK_STATE = {
Docs,
Indexing,
Cache,
Index,
NDX_FTSIndex,
Flex,
// Paths captured by loadFiles(); used when saving "to where we loaded from".
SavedCacheFilePath: null,
SavedIndexFilePath: null,
SavedFTSIndexDirPath: null,
SavedFuzzyIndexDirPath: null,
// Timer handles for the periodic savers (see collect / clearSavers).
saver: null,
indexSaver: null,
ftsIndexSaver: null,
// Re-entrancy flags for saveIndex and saveFTS.
saveInProgress: false,
ftsSaveInProgress: false
};
// Shallow copy: the Map/Set members above are shared with BLANK_STATE, not duplicated.
const State = Object.assign({}, BLANK_STATE);
// NOTE(review): IGNORE_NODES, TextNode and AttributeNode are not referenced anywhere in the
// visible part of this file (processDoc uses TEXT_NODE / FORBIDDEN_TEXT_PARENT from common.js).
const IGNORE_NODES = new Set([
'script',
'style',
'noscript',
'datalist'
]);
const TextNode = 3;
const AttributeNode = 2;
// Public API of this module (the default export).
const Archivist = {
// feature flags
NDX_OLD,
USE_FLEX,
// lifecycle
collect, getMode, changeMode, shutdown,
// hooks to call around a change of the library/base path
beforePathChanged,
afterPathChanged,
// index persistence and querying
saveIndex,
getIndex,
search,
getDetails,
isReady,
findOffsets,
}
// Response status codes for which getBody() does not ask the browser for a body.
const BODYLESS = new Set([
301,
302,
303,
307
]);
// Origins (this app's own server and the browser debugging port) that are never cached.
const NEVER_CACHE = new Set([
`http://localhost:${args.server_port}`,
`http://localhost:${args.chrome_port}`
]);
// Comparator over [key, value] entries, ordering by key; used when writing the index file.
// NOTE(review): never returns 0, and State.Index holds both string and numeric keys — confirm the
// resulting order is only cosmetic.
const SORT_URLS = ([urlA],[urlB]) => urlA < urlB ? -1 : 1;
// These three are functions: they are called (CACHE_FILE(), ...) to obtain the current path.
const CACHE_FILE = args.cache_file;
const INDEX_FILE = args.index_file;
const NO_FILE = args.no_file;
// Matches the '://' of an origin so it can be replaced when building a directory name.
const TBL = /:\/\//g;
// Options passed to hasha when naming saved response files.
const HASH_OPTS = {algorithm: 'sha1'};
// Stub response sent in 'serve' mode for any request that is not in the archive
// (also returned by getResponseData when a saved response file cannot be read).
const UNCACHED_TEXT = 'We have not saved this data';
const UNCACHED_BODY = b64(UNCACHED_TEXT);
const UNCACHED_CODE = 404;
const UNCACHED_HEADERS = [
  { name: 'Content-type', value: 'text/plain' },
  // Derived from the text instead of hard-coded: the message is 27 bytes, the old literal said '26'.
  // The text is pure ASCII, so its character count equals its byte length.
  { name: 'Content-length', value: String(UNCACHED_TEXT.length) }
];
const UNCACHED = {
  body: UNCACHED_BODY,
  responseCode: UNCACHED_CODE,
  responseHeaders: UNCACHED_HEADERS
};
// Current mode ('save' or 'serve') and the close function of the current browser connection;
// both are assigned in collect().
let Mode, Close;
// shutdown and cleanup
// handle writing out indexes and closing browser connection when resetting under nodemon
// After our own cleanup has run, the signal is re-sent to this process; because the handler was
// registered with `once`, it is no longer installed at that point.
process.once('SIGUSR2', function () {
shutdown(function () {
process.kill(process.pid, 'SIGUSR2');
});
});
export default Archivist;
// main
// Connects to the browser debugging endpoint on `chrome_port`, loads the archive files, and wires
// up the protocol event handlers for the requested `mode` ('save' or 'serve').
// Returns true once setup is complete; throws TypeError for any other mode.
// The handler functions referenced below are declared further down inside this function.
async function collect({chrome_port:port, mode} = {}) {
const {library_path} = args;
const {send, on, close} = await connect({port});
// targetId -> sessionId
const Sessions = new Map();
// sessionIds for which installForSession has completed
const Installations = new Set();
// sessionIds whose injected script has reported back (see confirmInstall)
const ConfirmedInstalls = new Set();
// pause (ms) used by cacheRequest in save mode before the paused request is resumed
const DELAY = 100; // 500 ?
Close = close;
Mode = mode;
let requestStage;
await loadFiles();
clearSavers();
if ( Mode == 'save' ) {
// pause requests at the response stage so cacheRequest can read the response body
requestStage = "Response";
// in case we get a updateBasePath call before an interval
// and we don't clear it in time, leading us to erroneously save the old
// cache to the new path, we always used our saved copy
State.saver = setInterval(() => saveCache(State.SavedCacheFilePath), 17000);
// we use timeout because we can trigger this ourself
// so in order to not get a race condition (overlapping calls) we ensure
// only 1 call at 1 time
State.indexSaver = setTimeout(() => saveIndex(State.SavedIndexFilePath), 11001);
State.ftsIndexSaver = setTimeout(() => saveFTS(State.SavedFTSIndexDirPath), 31001);
} else if ( Mode == 'serve' ) {
// pause requests before they are sent so cacheRequest can fulfil them from the archive
requestStage = "Request";
} else {
throw new TypeError(`Must specify mode`);
}
//on("Target.targetInfoChanged", displayTargetInfo);
on("Target.targetInfoChanged", attachToTarget);
on("Target.targetInfoChanged", reloadIfNotLive);
on("Target.targetInfoChanged", updateTargetInfo);
on("Target.targetInfoChanged", indexURL);
on("Target.attachedToTarget", installForSession);
on("Fetch.requestPaused", cacheRequest);
on("Runtime.consoleAPICalled", handleMessage);
await send("Target.setDiscoverTargets", {discover:true});
await send("Target.setAutoAttach", {autoAttach:true, waitForDebuggerOnStart:false, flatten: true});
await send("Fetch.enable", {
patterns: [
{
urlPattern: "http*://*",
requestStage
}
],
});
// attach to the page targets that already exist
const {targetInfos:targets} = await send("Target.getTargets", {});
const pageTargets = targets.filter(({type}) => type == 'page');
await Promise.all(pageTargets.map(attachToTarget));
Status.loaded = true;
return Status.loaded;
// Wraps `func` in a pass-through function: every argument is forwarded and the result returned.
// `text` is an optional label that is currently unused (the debug logging that used it is disabled).
function guard(func, text = '') {
  return (...callArgs) => func(...callArgs);
}
// Handler for "Runtime.consoleAPICalled".
// The injected page script talks to us via console.info(JSON); two messages are understood:
//   {install: {...}}      -> confirmInstall
//   {titleChange: {...}}  -> reindexOnTitleChange
// Any other console traffic is ignored.
//
// The first console argument is read defensively inside the try block: a console call with no
// arguments used to make the parameter destructuring throw before the try was even entered.
function handleMessage(args) {
  const {type, args: consoleArgs} = args;
  if ( type != 'info' ) return;
  try {
    const strVal = consoleArgs?.[0]?.value;
    // throws for anything that is not JSON text; that is expected for ordinary page logging
    const val = JSON.parse(strVal);
    // possible messages
    const {install, titleChange} = val;
    if ( install ) {
      confirmInstall({install});
    } else if ( titleChange ) {
      // reindexOnTitleChange is async; make sure a failure is reported rather than left unhandled
      Promise.resolve(reindexOnTitleChange({titleChange})).catch(e => {
        console.warn('Error reindexing after title change', e);
      });
    } else if ( DEBUG ) {
      console.warn(`Unknown message`, strVal);
    }
  } catch(e) {
    DEBUG && console.info('Not the message we expected to confirm install. This is OK.', {originalMessage:args});
  }
}
// Records that the injected script is running in the session named by `install.sessionId`.
// Only the first confirmation per session is recorded and logged.
function confirmInstall({install}) {
  const {sessionId} = install;
  if ( ConfirmedInstalls.has(sessionId) ) return;
  ConfirmedInstalls.add(sessionId);
  // log the install message itself; the old code logged `val`, which is not defined in this scope
  console.log({confirmedInstall:install, context});
}
// Called when the injected script reports a new document title.
// Waits until target info for the session is known, stores a copy carrying the new title,
// then triggers a re-index of that target.
async function reindexOnTitleChange({titleChange}) {
  const {currentTitle, sessionId} = titleChange;
  if ( DEBUG ) console.log('Received titleChange', titleChange);
  const known = await untilHas(Targets, sessionId);
  const updated = clone(known);
  updated.title = currentTitle;
  Targets.set(sessionId, updated);
  if ( DEBUG ) console.log('Updated stored target info', updated);
  indexURL({targetInfo: updated});
}
// Debug helper: pretty-prints the info of page targets; other target types are ignored.
function displayTargetInfo({targetInfo}) {
  if ( targetInfo.type !== 'page' ) return;
  const pretty = JSON.stringify(targetInfo, null, 2);
  console.log("Target info", pretty);
}
// Handler for "Target.targetInfoChanged": stores a copy of the latest info for page targets
// whose session is known.
// If we already hold info for the same URL with a title that differs from the URL (i.e. a real
// title was saved), that title is kept, because targetInfo does not reflect the latest title.
// Note: in that case the incoming targetInfo object itself is modified.
function updateTargetInfo({targetInfo}) {
  if ( targetInfo.type !== 'page' ) return;
  const sessionId = Sessions.get(targetInfo.targetId);
  DEBUG && console.log('Updating target info', targetInfo, sessionId);
  if ( ! sessionId ) return;
  const previous = Targets.get(sessionId);
  DEBUG && console.log('Existing target info', previous);
  const sameUrl = !!previous && previous.url === targetInfo.url;
  if ( sameUrl && previous.title !== previous.url ) {
    DEBUG && console.log('Setting title to existing', previous);
    targetInfo.title = previous.title;
  }
  Targets.set(sessionId, clone(targetInfo));
}
// Handler for "Target.targetInfoChanged" (save mode only): if an attached page has a known session
// whose injected script has not confirmed its install yet, stop and reload the page.
// The two protocol commands are sent without waiting for their results.
async function reloadIfNotLive({targetInfo}) {
  if ( Mode == 'serve' ) return;
  const {attached, type} = targetInfo;
  if ( !(attached && type == 'page') ) return;
  const {url, targetId} = targetInfo;
  const sessionId = Sessions.get(targetId);
  if ( !sessionId || ConfirmedInstalls.has(sessionId) ) return;
  DEBUG && console.log({reloadingAsNotConfirmedInstalled:{url, sessionId}});
  send("Page.stopLoading", {}, sessionId);
  send("Page.reload", {}, sessionId);
}
// True for URLs that are never cached: about:blank, anything starting with 'chrome',
// and the exact entries of NEVER_CACHE. A missing url yields false.
function neverCache(url) {
  if ( url == "about:blank" ) return true;
  if ( url?.startsWith('chrome') ) return true;
  return NEVER_CACHE.has(url);
}
// Handler for "Target.attachedToTarget": prepares a freshly attached page session for archiving.
// Does nothing in serve mode, for non-page targets, or for sessions already set up.
// In save mode it disables the browser cache and service workers, enables the protocol domains we
// use, and registers the injected script for every new document; finally it starts indexing.
// Throws TypeError when no sessionId is supplied.
async function installForSession({sessionId, targetInfo, waitingForDebugger}) {
  if ( ! sessionId ) {
    throw new TypeError(`installForSession needs a sessionId`);
  }
  const {targetId, url} = targetInfo;
  const skip = Mode == 'serve'
    || dontInstall(targetInfo)
    || targetInfo.type != 'page'
    || Installations.has(sessionId);
  if ( skip ) return;

  DEBUG && console.log("installForSession running on target " + targetId);
  Sessions.set(targetId, sessionId);
  Targets.set(sessionId, clone(targetInfo));

  if ( Mode == 'save' ) {
    // these two are sent without waiting for their results
    send("Network.setCacheDisabled", {cacheDisabled:true}, sessionId);
    send("Network.setBypassServiceWorker", {bypass:true}, sessionId);
    for ( const domain of ["Runtime.enable", "Page.enable", "DOMSnapshot.enable"] ) {
      await send(domain, {}, sessionId);
    }
    const injection = {
      source: getInjection({sessionId}),
      worldName: "Context-22120-Indexing"
    };
    await send("Page.addScriptToEvaluateOnNewDocument", injection, sessionId);
    DEBUG && console.log("Just request install", targetId, url);
  }

  Installations.add(sessionId);
  DEBUG && console.log('Installed sessionId', sessionId);
  indexURL({targetInfo});
}
// Captures the text of a page target and (re)indexes it in all three search back ends
// (FlexSearch, NDX, fuzzy), recording the url <-> id mappings in State.Index.
//
// Accepts the payload of "Target.targetInfoChanged" ({targetInfo}) and may also be given a
// sessionId directly. Does nothing in serve mode, for non-page targets, for blank/chrome URLs,
// for URLs excluded by dontCache, or while the same target is already being indexed.
//
// The "currently indexing" marker is now released in a `finally`: previously any failure
// (for example the snapshot command rejecting because the tab was closed) left the target id in
// State.Indexing forever, so that target was never indexed again.
async function indexURL({targetInfo:info = {}, sessionId, waitingForDebugger} = {}) {
  if ( Mode == 'serve' ) return;
  if ( info.type != 'page' ) return;
  if ( ! info.url || info.url == 'about:blank' ) return;
  if ( info.url.startsWith('chrome') ) return;
  if ( dontCache(info) ) return;

  console.log('Index URL called', info);

  if ( State.Indexing.has(info.targetId) ) return;
  State.Indexing.add(info.targetId);

  try {
    if ( ! sessionId ) {
      sessionId = await untilHas(Sessions, info.targetId);
    }
    if ( !Installations.has(sessionId) ) {
      await untilHas(Installations, sessionId);
    }

    // awaited so that a failure surfaces here instead of as an unhandled rejection
    await send("DOMSnapshot.enable", {}, sessionId);
    await sleep(500);

    const flatDoc = await send("DOMSnapshot.captureSnapshot", {
      computedStyles: [],
    }, sessionId);
    const pageText = processDoc(flatDoc);

    // title and url come from our stored target info, which carries the latest known title
    const {title, url} = Targets.get(sessionId);

    // reuse the document id of an already indexed url, otherwise allocate the next one
    let id, ndx_id;
    if ( State.Index.has(url) ) {
      ({ndx_id, id} = State.Index.get(url));
    } else {
      Id++;
      id = Id;
    }

    // toNDXDoc always allocates a fresh ndx_id; the previous one (if any) is removed below
    const doc = toNDXDoc({id, url, title, pageText});
    State.Index.set(url, {id:doc.id, ndx_id:doc.ndx_id, title});
    State.Index.set(doc.id, url);
    State.Index.set('ndx'+doc.ndx_id, url);

    const contentSignature = getContentSig(doc);

    //Flex code
    Flex.update(doc.id, contentSignature);

    //New NDX code
    NDX_FTSIndex.update(doc, ndx_id);

    // Fuzzy
    // eventually we can use this update logic for everyone
    const current = State.Docs.get(url);
    const updateFuzz = !State.Docs.has(url) || current.contentSignature !== contentSignature;
    if ( updateFuzz ) {
      doc.contentSignature = contentSignature;
      fuzzy.add(doc);
      State.Docs.set(url, doc);
      console.log(doc,url);
    }

    DEBUG && console.log("NDX updated", doc.ndx_id);

    UpdatedKeys.add(url);

    console.log({id: doc.id, title, url, indexed: true});
  } finally {
    State.Indexing.delete(info.targetId);
  }
}
// Extracts the page text from a DOMSnapshot.captureSnapshot result.
//
// Implementation notes
// 1. Follows the NodeTreeSnapshot layout described at
//    https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#type-NodeTreeSnapshot
//    (parallel arrays per document; names and values are indices into `strings`).
// 2. Text nodes whose parent's node name is in FORBIDDEN_TEXT_PARENT are skipped.
// 3. textarea / input values are NOT included; they would have to be read from the document's
//    textValue and inputValue properties if we ever want them.
//
// Returns the non-blank text pieces joined by single spaces.
function processDoc({documents, strings}) {
  const pieces = [];

  for ( const {nodes} of documents ) {
    nodes.nodeType.forEach((type, nodeIndex) => {
      if ( type !== TEXT_NODE ) return;

      const parentIndex = nodes.parentIndex[nodeIndex];
      if ( parentIndex >= 0 ) {
        const parentName = strings[nodes.nodeName[parentIndex]];
        if ( FORBIDDEN_TEXT_PARENT.has(parentName) ) return;
      }

      // a negative value index means the node carries no text
      const valueIndex = nodes.nodeValue[nodeIndex];
      if ( valueIndex >= 0 ) {
        pieces.push(strings[valueIndex]);
      }
    });
  }

  const pageText = pieces.filter(piece => piece.trim()).join(' ');
  DEBUG && console.log('Page text>>>', pageText);
  return pageText;
}
// Attaches to a page target that is not attached yet and remembers the resulting session id.
// Expects the targetInfo object itself (as passed when mapping over Target.getTargets results).
// NOTE(review): this function is also registered for "Target.targetInfoChanged", whose payload is
// {targetInfo}; with that shape `type` is undefined, so the first check returns immediately.
// Confirm whether the event case was meant to attach as well.
async function attachToTarget(targetInfo) {
  if ( dontInstall(targetInfo) ) return;
  const {url, type, attached, targetId} = targetInfo;
  const attachable = url && type == 'page' && ! attached;
  if ( ! attachable ) return;
  const {sessionId} = await send("Target.attachToTarget", {
    targetId,
    flatten: true
  });
  Sessions.set(targetId, sessionId);
}
// Handler for "Fetch.requestPaused".
//
// serve mode: the request is answered from the archive (or with the UNCACHED stub) and never
//             reaches the network.
// save mode:  the request is paused at the response stage; the response is written to the
//             archive and then the request is resumed, failed, or — for failed navigations —
//             answered with the "blocked" page.
// Requests excluded by dontCache are simply continued.
//
// Fix: the catch around Fetch.continueRequest referenced an undefined `message` variable, so a
// failed continue (e.g. the request was already gone) raised a ReferenceError instead of logging.
async function cacheRequest(pausedRequest) {
  const {
    requestId, request, resourceType,
    responseStatusCode, responseHeaders, responseErrorReason
  } = pausedRequest;
  const isNavigationRequest = resourceType == "Document";
  const isFont = resourceType == "Font";

  if ( dontCache(request) ) {
    DEBUG && console.log("Not caching", request.url);
    return send("Fetch.continueRequest", {requestId});
  }

  const key = serializeRequest(request);

  if ( Mode == 'serve' ) {
    if ( State.Cache.has(key) ) {
      let {body, responseCode, responseHeaders} = await getResponseData(State.Cache.get(key));
      responseCode = responseCode || 200;
      //DEBUG && console.log("Fulfilling", key, responseCode, responseHeaders, body.slice(0,140));
      DEBUG && console.log("Fulfilling", key, responseCode, body.slice(0,140));
      await send("Fetch.fulfillRequest", {
        requestId, body, responseCode, responseHeaders
      });
    } else {
      DEBUG && console.log("Sending cache stub", key);
      await send("Fetch.fulfillRequest", {
        requestId, ...UNCACHED
      });
    }
  } else if ( Mode == 'save' ) {
    const response = {key, responseCode: responseStatusCode, responseHeaders};
    const resp = await getBody({requestId, responseStatusCode});
    if ( !! resp ) {
      let {body, base64Encoded} = resp;
      // bodies are always stored base64 encoded, which is what Fetch.fulfillRequest expects
      if ( ! base64Encoded ) {
        body = b64(body);
      }
      response.body = body;
      const responsePath = await saveResponseData(key, request.url, response);
      State.Cache.set(key, responsePath);
    } else {
      DEBUG && console.warn("get response body error", key, responseStatusCode, responseHeaders, pausedRequest.responseErrorReason);
      response.body = '';
    }

    await sleep(DELAY);

    if ( !isFont && responseErrorReason ) {
      if ( isNavigationRequest ) {
        // a failed navigation gets a page showing the error reason
        await send("Fetch.fulfillRequest", {
          requestId,
          responseHeaders: BLOCKED_HEADERS,
          responseCode: BLOCKED_CODE,
          body: Buffer.from(responseErrorReason).toString("base64"),
        });
      } else {
        await send("Fetch.failRequest", {
          requestId,
          errorReason: responseErrorReason
        });
      }
    } else {
      try {
        await send("Fetch.continueRequest", {
          requestId,
        });
      } catch(e) {
        // best effort: the request may no longer exist by the time we resume it
        console.warn("Issue with continuing request", e);
      }
    }
  }
}
// Returns the response body of a paused request as {body, base64Encoded}.
// For status codes in BODYLESS the browser is not asked; an empty body is returned instead.
async function getBody({requestId, responseStatusCode}) {
  if ( BODYLESS.has(responseStatusCode) ) {
    return {body:'', base64Encoded:true};
  }
  const resp = await send("Fetch.getResponseBody", {requestId});
  return resp;
}
// True when the target is anything other than a page; only page targets get our instrumentation.
function dontInstall(targetInfo) {
  const {type} = targetInfo;
  return type !== 'page';
}
// Decides whether a request (anything with a `url` property) must be left out of the archive.
// Returns true when there is no url, when neverCache() matches it, or when its origin is in
// NEVER_CACHE; otherwise returns the outcome of testing the host against the user's exclusion
// pattern State.No (which is State.No itself, i.e. falsy, when no pattern is loaded).
function dontCache(request) {
  const {url: rawUrl} = request;
  if ( ! rawUrl || neverCache(rawUrl) ) return true;
  const {origin, host} = new URL(rawUrl);
  if ( NEVER_CACHE.has(origin) ) return true;
  return State.No && State.No.test(host);
}
// Reads and parses one saved response file.
// If the file cannot be read or parsed, the problem is logged and the UNCACHED stub is returned.
async function getResponseData(path) {
  let saved;
  try {
    const raw = await Fs.promises.readFile(path);
    saved = JSON.parse(raw);
  } catch(e) {
    console.warn(`Error with ${path}`, e);
    saved = UNCACHED;
  }
  return saved;
}
// Writes one response to disk and returns the path of the file written.
// Files live in one directory per origin (the origin with '://' replaced by '_') below the
// library path, and are named after the hash of the request key.
// The origin directory is remembered in State.Cache under the origin itself.
//
// Fix: the catch around mkdir used `responsePath`, which is declared further down, so a failing
// mkdir raised a ReferenceError (temporal dead zone) instead of logging the directory.
async function saveResponseData(key, url, response) {
  const origin = (new URL(url).origin);
  let originDir = State.Cache.get(origin);
  if ( ! originDir ) {
    originDir = Path.resolve(library_path(), origin.replace(TBL, '_'));
    try {
      await Fs.promises.mkdir(originDir, {recursive:true});
    } catch(e) {
      // best effort: if the directory really is unusable, the write below reports it
      console.warn(`Issue with origin directory ${originDir}`, e);
    }
    State.Cache.set(origin, originDir);
  }

  const fileName = `${await hasha(key, HASH_OPTS)}.json`;
  const responsePath = Path.resolve(originDir, fileName);
  await Fs.promises.writeFile(responsePath, JSON.stringify(response,null,2));

  return responsePath;
}
// Builds the cache key for a request: the HTTP method immediately followed by the URL.
// Headers, URL fragment and post data are deliberately not part of the key at present.
function serializeRequest(request) {
  const {method, url} = request;
  return ''.concat(method, url);
}
}
// helpers
// Resolves once collect() has finished its setup, i.e. once Status.loaded is truthy.
async function isReady() {
  const ready = await untilHas(Status, 'loaded');
  return ready;
}
// Loads the saved fuzzy-search documents from disk into State.Docs (keyed by url) and rebuilds
// the fuzzy searcher from them. The derived fields i_url and contentSignature are not stored in
// the file (see saveFuzzy), so they are recomputed here for every document.
async function loadFuzzy() {
  const serialized = Fs.readFileSync(getFuzzyPath()).toString();
  const entries = [];
  for ( const doc of JSON.parse(serialized) ) {
    doc.i_url = getURI(doc.url);
    doc.contentSignature = getContentSig(doc);
    entries.push([doc.url, doc]);
  }
  State.Docs = new Map(entries);
  fuzzy = new Fuzzy({source: [...State.Docs.values()], keys: FUZZ_OPTS.keys});
  State.Fuzzy = fuzzy;
  console.log('Fuzzy loaded');
}
// The text a document contributes to the indexes: title, content and the word-split url,
// separated by single spaces. Also used to detect whether a document changed.
function getContentSig(doc) {
  return `${doc.title} ${doc.content} ${getURI(doc.url)}`;
}
// Turns a url into indexable words by replacing every '/' and '.' (URI_SPLIT) with a space.
function getURI(url) {
  return url.replace(URI_SPLIT, ' ');
}
// Writes the fuzzy-search documents (url, title, content and id only) as JSON to the fuzzy index
// file for `basePath`.
function saveFuzzy(basePath) {
  const slim = [];
  for ( const {url, title, content, id} of State.Docs.values() ) {
    slim.push({url, title, content, id});
  }
  const path = getFuzzyPath(basePath);
  Fs.writeFileSync(path, JSON.stringify(slim));
  console.log(`Wrote fuzzy to ${path}`);
}
// Cancels the periodic save timers (cache interval, index timeout, full-text index timeout)
// that are currently set and resets their handles in State to null.
function clearSavers() {
  const timers = [
    ['saver', clearInterval],
    ['indexSaver', clearTimeout],
    ['ftsIndexSaver', clearTimeout],
  ];
  for ( const [slot, cancel] of timers ) {
    if ( ! State[slot] ) continue;
    cancel(State[slot]);
    State[slot] = null;
  }
}
// Loads the whole archive state from the current base path: the cache key file, the index file,
// the three full-text indexes (Flex, NDX, fuzzy) and the optional "No file" of excluded hosts.
// If any part cannot be loaded the user is asked (on the terminal) whether to patch the existing
// archive or to start a new one; blank copies are then written for whatever was missing.
// Also initialises the Id / NDXId counters and records the paths that were loaded in State.
//
// Fixes in this version:
// - the No file was read with `Fs.readFileSync(NO_FILE)` (the function, not its result), which
//   always threw, so the exclusion list was never applied;
// - loadFuzzy() is async and was not awaited, so its failure escaped the try/catch as an
//   unhandled rejection instead of being treated as a load error;
// - the saved paths are re-read after the base path may have changed, so that they do not keep
//   pointing at the archive the user chose to leave untouched;
// - the readline interface is closed again, and parseInt gets an explicit radix.
async function loadFiles() {
  let cacheFile = CACHE_FILE();
  let indexFile = INDEX_FILE();
  let ftsDir = FTS_INDEX_DIR();
  let someError = false;

  try {
    State.Cache = new Map(JSON.parse(Fs.readFileSync(cacheFile)));
  } catch(e) {
    DEBUG && console.warn(`Could not load cache key file ${cacheFile}`, e);
    State.Cache = new Map();
    someError = true;
  }

  try {
    State.Index = new Map(JSON.parse(Fs.readFileSync(indexFile)));
  } catch(e) {
    DEBUG && console.warn(`Could not load index file ${indexFile}`, e);
    State.Index = new Map();
    someError = true;
  }

  try {
    const flexBase = getFlexBase();
    Fs.readdirSync(flexBase, {withFileTypes:true}).forEach(dirEnt => {
      if ( dirEnt.isFile() ) {
        const content = Fs.readFileSync(Path.resolve(flexBase, dirEnt.name)).toString();
        Flex.import(dirEnt.name, JSON.parse(content));
      }
    });
    console.log('Flex loaded');
  } catch(e) {
    DEBUG && console.warn('Could not load Flex index', e);
    someError = true;
  }

  try {
    loadNDXIndex(NDX_FTSIndex);
  } catch(e) {
    DEBUG && console.warn('Could not load NDX index', e);
    someError = true;
  }

  try {
    await loadFuzzy();
  } catch(e) {
    DEBUG && console.warn('Could not load fuzzy docs', e);
    someError = true;
  }

  if ( someError ) {
    const newBasePath = await askHowToRepairArchive();
    console.log('Resetting base path', newBasePath);
    args.updateBasePath(newBasePath, {force:true});
    saveFiles({forceSave:true});
    // the base path may have changed: remember the files we will actually be using from now on
    cacheFile = CACHE_FILE();
    indexFile = INDEX_FILE();
    ftsDir = FTS_INDEX_DIR();
  }

  // every document occupies (at least) two entries in the index: url -> info and id -> url
  Id = Math.round(State.Index.size / 2) + 3;
  NDXId = State.Index.has(NDX_ID_KEY) ? State.Index.get(NDX_ID_KEY) + 3000 : (Id + 1000000);
  if ( !Number.isInteger(NDXId) ) NDXId = Id;
  DEBUG && console.log({firstFreeId: Id, firstFreeNDXId: NDXId});

  State.SavedCacheFilePath = cacheFile;
  State.SavedIndexFilePath = indexFile;
  State.SavedFTSIndexDirPath = ftsDir;
  DEBUG && console.log(`Loaded cache key file ${cacheFile}`);
  DEBUG && console.log(`Loaded index file ${indexFile}`);
  DEBUG && console.log(`Need to load FTS index dir ${ftsDir}`);

  try {
    if ( !Fs.existsSync(NO_FILE()) ) {
      DEBUG && console.log(`The 'No file' (${NO_FILE()}) does not exist, ignoring...`);
      State.No = null;
    } else {
      // the file holds a JSON array of host patterns using '*' and '?' wildcards;
      // they are combined into one regular expression
      State.No = new RegExp(JSON.parse(Fs.readFileSync(NO_FILE()))
        .join('|')
        .replace(/\./g, '\\.')
        .replace(/\*/g, '.*')
        .replace(/\?/g, '.?')
      );
    }
  } catch(e) {
    DEBUG && console.warn('Error compiling regex from No file', e);
    State.No = null;
  }
}

// Explains the corrupted-archive situation on the terminal and asks the user how to proceed.
// Returns the base path to use: the current one (option 1) or a new directory below the home
// directory (option 2).
async function askHowToRepairArchive() {
  const rl = readline.createInterface({input, output});
  const question = util.promisify(rl.question).bind(rl);
  try {
    console.warn('Error reading archive file. Your archive directory is corrupted. We will attempt to patch it so you can use it going forward, but because we replace a missing or corrupt index, cache, or full-text search index files with new blank copies, existing resources already indexed and cached may become inaccessible from your new index. A future version of this software should be able to more completely repair your archive directory, reconnecting and re-existing all cached resources and notifying you about and purging from the index any missing resources.\n');
    console.log('Sorry about this, we are not sure why this happened, but we know this must be very distressing for you.\n');
    console.log(`For your information, the corruped archive directory is at: ${args.getBasePath()}\n`);
    console.info('Because this repair as described above is not a perfect solution, we will give you a choice of how to proceed. You have two options: 1) attempt a basic repair that may leave some resources inaccessible from the repaired archive, or 2) do not touch the corrupted archive, but instead create a new fresh blank archive to begin saving to. Which option would you like to proceed with?');
    console.log('1) Basic repair with possible inaccessible pages');
    console.log('2) Leave the corrupt archive untouched, start a new archive');

    while( true ) {
      const answer = Number.parseInt(await question('Which option would you like (1 or 2)? '), 10);
      if ( answer === 1 ) {
        console.log('Alright, selecting option 1. Using the existing archive and patching a simple repair.');
        return args.getBasePath();
      }
      if ( answer === 2 ) {
        console.log('Alright, selection option 2. Leaving the existing archive along and creating a new, fresh, blank archive.');
        while( true ) {
          try {
            return Path.resolve(os.homedir(), await question(
              'Please enter a directory name for your new archive.\n' +
              `${os.homedir()}/`
            ));
          } catch(e2) {
            console.warn(e2);
            console.info('Sorry that was not a valid directory name.');
            await question('enter to continue');
          }
        }
      }
      console.log('Sorry, that was not a valid option. Please input 1 or 2.');
    }
  } finally {
    // release stdin again once the question has been answered
    rl.close();
  }
}
// Returns the mode collect() is currently running in ('save' or 'serve'),
// or undefined before collect() has been called.
function getMode() {
  const current = Mode;
  return current;
}
// Writes cache keys, index and full-text indexes to disk right away.
//   useState  - when true, write to the paths recorded at load time (State.Saved*Path);
//               otherwise let each saver use the current default path
//   forceSave - passed on to saveFTS so it writes even if nothing changed since the last save
// The pending periodic savers are cancelled first, and the NDX id counter is stored in the index
// so it is persisted with it.
function saveFiles({useState: useState = false, forceSave:forceSave = false} = {}) {
  clearSavers();
  State.Index.set(NDX_ID_KEY, NDXId);

  if ( ! useState ) {
    saveCache();
    saveIndex();
    saveFTS(null, {forceSave});
    return;
  }

  // saves the old cache path
  saveCache(State.SavedCacheFilePath);
  saveIndex(State.SavedIndexFilePath);
  saveFTS(State.SavedFTSIndexDirPath, {forceSave});
}
// Switches between 'save' and 'serve': everything is written to disk, the current browser
// connection is closed, and collect() is run again in the new mode.
async function changeMode(mode) {
  if ( DEBUG ) console.log({modeChange:mode});
  saveFiles({forceSave:true});
  if ( Close ) Close();
  Mode = mode;
  const {chrome_port} = args;
  await collect({chrome_port, mode});
}
// Looks up a document by its numeric id and returns {url, title, id, content}.
// Throws a TypeError if the id (or its url) is not present in the index or the stored documents.
function getDetails(id) {
  const url = State.Index.get(id);
  const indexEntry = State.Index.get(url);
  const title = indexEntry.title;
  const storedDoc = State.Docs.get(url);
  const content = storedDoc.content;
  return {url, title, id, content};
}
// Returns the fuzzy searcher's highlighted version of `doc` (a piece of text such as a url or
// title). `query` is only used for the log line and `count` is currently unused; the result is
// always logged.
function findOffsets(query, doc, count = 0) {
  const highlighted = fuzzy.highlight(doc);
  console.log(query, highlighted);
  return highlighted;
}
// To be called before the library path changes: saves everything to the paths it was loaded
// from, then empties the in-memory cache, index and documents and replaces the three search
// indexes with fresh, empty ones.
function beforePathChanged() {
  saveFiles({useState:true, forceSave:true});

  // clear all memory cache, index and full text indexes
  for ( const store of [State.Index, State.Cache, State.Docs] ) {
    store.clear();
  }

  NDX_FTSIndex = new NDXIndex(NDX_FIELDS);
  State.NDX_FTSIndex = NDX_FTSIndex;

  Flex = new FTSIndex(FLEX_OPTS);
  State.Flex = Flex;

  fuzzy = new Fuzzy({source: [...State.Docs.values()], keys: FUZZ_OPTS.keys});
  State.fuzzy = fuzzy;
}
// To be called after the library path changed: saves once more to the paths recorded in State,
// then reloads everything from the new path (which also updates those recorded paths).
// NOTE(review): if beforePathChanged() ran first, the in-memory state is empty at this point and
// the recorded paths still refer to the previous library — confirm that this save cannot
// overwrite the previous archive with blank data.
async function afterPathChanged() {
  if ( DEBUG ) console.log({libraryPathChange:args.library_path()});
  saveFiles({useState:true, forceSave:true});
  // reloads from new path and updates Saved FilePaths
  await loadFiles();
}
// Writes the cache key map (request key -> response file path) as pretty-printed JSON to `path`,
// or to the current default cache file when no path is given. Only does anything under node.
function saveCache(path) {
  if ( context != 'node' ) return;
  //DEBUG && console.log("Writing to", path || CACHE_FILE());
  const target = path || CACHE_FILE();
  const serialized = JSON.stringify([...State.Cache.entries()],null,2);
  Fs.writeFileSync(target, serialized);
}
// Writes the index (sorted by key, pretty-printed JSON) to `path`, or to the current default
// index file when no path is given, and schedules the next periodic save.
// Calls that arrive while a save is in progress are ignored.
//
// Fix: the in-progress flag and the re-scheduling now live in a `finally`. Previously a failing
// write left State.saveInProgress stuck at true, so every later call returned immediately and
// the index was silently never saved again.
function saveIndex(path) {
  if ( State.saveInProgress ) return;
  State.saveInProgress = true;

  // we may have been triggered manually: drop the pending timer so saves never overlap
  clearTimeout(State.indexSaver);

  try {
    if ( context == 'node' ) {
      //DEBUG && console.log("Writing to", path || INDEX_FILE());
      //DEBUG && console.log([...State.Index.entries()].sort(SORT_URLS));
      Fs.writeFileSync(
        path || INDEX_FILE(),
        JSON.stringify([...State.Index.entries()].sort(SORT_URLS),null,2)
      );
    }
  } finally {
    // the timer calls saveIndex without arguments, i.e. with the default index file
    State.indexSaver = setTimeout(saveIndex, 11001);
    State.saveInProgress = false;
  }
}
// Reads the index file from disk and returns only its url entries: string keys that are not
// bookkeeping keys (see hiddenKey). Numeric id -> url entries are dropped as well.
function getIndex() {
  const entries = JSON.parse(Fs.readFileSync(INDEX_FILE()));
  const isUrlEntry = ([key]) => typeof key === 'string' && !hiddenKey(key);
  return entries.filter(isUrlEntry);
}
// Runs `query` against all three search back ends, merges the rankings and returns
//   {query, results, HL}
// where `results` is the list of document ids from combineResults and `HL` maps a document id to
// {id, url, title} with highlighted url/title (falling back to the plain values), for those
// fuzzy hits that made it into the results.
async function search(query) {
  const flexIds = await Flex.searchAsync(query, args.results_per_page);
  const flex = flexIds.map(id => ({id, url: State.Index.get(id)}));

  const ndx = NDX_FTSIndex.search(query).map(({key, score}) => ({
    ndx_id: key,
    url: State.Index.get('ndx'+key),
    score
  }));

  const fuzzRaw = fuzzy.search(query);
  const fuzz = processFuzzResults(fuzzRaw);

  const results = combineResults({flex, ndx, fuzz});
  const wanted = new Set(results);

  const HL = new Map();
  for ( const {id, url} of fuzzRaw ) {
    if ( ! wanted.has(id) ) continue;
    const title = State.Index.get(url)?.title;
    HL.set(id, {
      id,
      url: Archivist.findOffsets(query, url) || url,
      title: Archivist.findOffsets(query, title) || title,
    });
  }

  return {query,results, HL};
}
// Merges the ranked result lists of the three search back ends into one list of document ids,
// best match first.
// Every list contributes rank points per url (see countRank: earlier positions earn more); the
// points are summed per url in the `value` field and the urls are ordered by that sum.
//
// Fix: the sort used a `score` property that these records do not have (the sum is kept in
// `value`), so the comparator always produced NaN and the results were never ordered. The sort is
// now on `value`, descending, so urls ranked highly by more back ends come first.
//
// Throws if a result url is missing from State.Index (after logging the inputs).
function combineResults({flex,ndx,fuzz}) {
  const DEBUG = true;
  DEBUG && console.log({flex,ndx,fuzz});

  const score = {};
  flex.forEach(countRank(score));
  ndx.forEach(countRank(score));
  fuzz.forEach(countRank(score));

  const results = Object.values(score).map(obj => {
    try {
      const {id} = State.Index.get(obj.url);
      obj.id = id;
      return obj;
    } catch(e) {
      console.log({obj, index:State.Index, e, ndx, flex, fuzz});
      throw e;
    }
  });

  results.sort(({value:valueA}, {value:valueB}) => valueB - valueA);

  return results.map(({id}) => id);
}
// Returns a forEach callback that adds rank points to `record` (an object keyed by url).
// A result at position `rank` in a list of length n earns weight * (n - rank) points, so earlier
// results earn more. Entries are created on first sight as {url, value: 0}.
function countRank(record, weight = 1.0) {
  return ({url,id}, rank, all) => {
    const entry = record[url] || (record[url] = {url, value: 0});
    const points = weight*(all.length - rank);
    entry.value += points;
  };
}
// Reduces raw fuzzy hits to one {id, url} record per distinct document id, in order of first
// appearance; the url is looked up in the index by id.
function processFuzzResults(docs) {
  const distinctIds = new Set();
  for ( const {id} of docs ) {
    distinctIds.add(id);
  }
  return Array.from(distinctIds, id => ({id, url:State.Index.get(id)}));
}
// Writes the three full-text indexes (Flex, NDX, fuzzy) below `path`, or below the current
// default full-text index directory when no path is given, and schedules the next periodic save.
// Nothing is written unless documents were indexed since the last save or `forceSave` is set.
// Calls that arrive while a save is in progress are ignored.
//
// Fix: the in-progress flag and the re-scheduling now live in a `finally`. Previously a failing
// write left State.ftsSaveInProgress stuck at true, so every later call returned immediately and
// the full-text indexes were silently never saved again.
async function saveFTS(path = undefined, {forceSave:forceSave = false} = {}) {
  if ( State.ftsSaveInProgress ) return;
  State.ftsSaveInProgress = true;

  // we may have been triggered manually: drop the pending timer so saves never overlap
  clearTimeout(State.ftsIndexSaver);

  try {
    if ( context == 'node' ) {
      DEBUG && console.log("Writing FTS index to", path || FTS_INDEX_DIR());
      const dir = path || FTS_INDEX_DIR();

      if ( forceSave || UpdatedKeys.size ) {
        DEBUG && console.log(`${UpdatedKeys.size} keys updated since last write`);
        const flexBase = getFlexBase(dir);
        Flex.export((key, data) => {
          // only the last dot-separated part of the export key is used as the file name
          key = key.split('.').pop();
          try {
            Fs.writeFileSync(
              Path.resolve(flexBase, key),
              JSON.stringify(data)
            );
          } catch(e) {
            console.error('Error writing full text search index', e);
          }
        });
        console.log(`Wrote Flex to ${flexBase}`);
        NDX_FTSIndex.save(dir);
        saveFuzzy(dir);
        UpdatedKeys.clear();
      } else {
        DEBUG && console.log("No FTS keys updated, no writes needed this time.");
      }
    }
  } finally {
    // the timer calls saveFTS without arguments, i.e. with the default directory
    State.ftsIndexSaver = setTimeout(saveFTS, 31001);
    State.ftsSaveInProgress = false;
  }
}
// Saves everything, closes the browser connection (if there is one) and then invokes the optional
// `then` callback, returning its result. Without a callback the (falsy) argument itself is
// returned.
function shutdown(then) {
  if ( DEBUG ) console.log(`Archivist shutting down...`);
  saveFiles({forceSave:true});
  if ( Close ) Close();
  if ( DEBUG ) console.log(`Archivist shut down.`);
  if ( ! then ) return then;
  return then();
}
// Base64-encodes `s`: via Buffer when running under node, via btoa otherwise.
function b64(s) {
  return context == 'node'
    ? Buffer.from(s).toString('base64')
    : btoa(s);
}
// Constructor for our wrapper around an ndx full-text index. Must be called with `new`.
//
// `fields` is the list returned by ndxDocFields(): objects of the form {name} when NDX_OLD is
// false. The returned object offers:
//   index            the underlying ndx index (replaced by load())
//   add(doc)         index a document under the key doc.ndx_id
//   remove(id)       mark a key as removed (vacuuming happens lazily, see maybeClean)
//   update(doc, old) remove the old key, then add the document under its new key
//   search(q)        BM25-ranked query; entries of the result carry `key` and `score`
//   save(basePath)   vacuum pending removals and write the serialized index to disk
//   load(newIndex)   replace the underlying index, e.g. with one read from disk
//
// Fix: calling without `new` now throws a TypeError instead of a bare string, so the failure
// carries a stack trace and can be recognised with `instanceof Error`.
function NDXIndex(fields) {
  let retVal;

  if ( ! NDX_OLD ) {
    // Old code (from newer, in my opinion, worse, version)
    // source:
    // adapted from:
    // https://github.com/ndx-search/docs/blob/94530cbff6ae8ea66c54bba4c97bdd972518b8b4/README.md#creating-a-simple-indexer-with-a-search-function

    if ( ! new.target ) { throw new TypeError(`NDXIndex must be called with 'new'`); }

    // `createIndex()` creates an index data structure.
    // First argument specifies how many different fields we want to index.
    const index = NDX(fields.length);
    // `fieldAccessors` is an array with functions that used to retrieve data from different fields.
    const fieldAccessors = fields.map(f => doc => doc[f.name]);
    // `fieldBoostFactors` is an array of boost factors for each field, in this example all fields will have
    // identical factors.
    const fieldBoostFactors = fields.map(() => 1);

    retVal = {
      index,
      // `add()` function will add documents to the index.
      add: doc => ndx(
        retVal.index,
        fieldAccessors,
        // Tokenizer is a function that breaks text into words, phrases, symbols, or other meaningful elements
        // called tokens.
        // Here `words` is the WordTokenizer from the natural library.
        words,
        // Filter is a function that processes tokens and returns terms, terms are used in Inverted Index to
        // index documents.
        termFilter,
        // Document key, it can be a unique document id or a refernce to a document if you want to store all documents
        // in memory.
        doc.ndx_id,
        // Document.
        doc,
      ),
      remove: id => {
        removeDocumentFromIndex(retVal.index, NDXRemoved, id);
        maybeClean();
      },
      update: (doc, old_id) => {
        retVal.remove(old_id);
        retVal.add(doc);
      },
      // `search()` function will be used to perform queries.
      search: q => NDXQuery(
        retVal.index,
        fieldBoostFactors,
        // BM25 ranking function constants:
        1.2,  // BM25 k1 constant, controls non-linear term frequency normalization (saturation).
        0.75, // BM25 b constant, controls to what degree document length normalizes tf values.
        words,
        termFilter,
        // Set of removed documents; keys in it are left out of the results until the index is vacuumed.
        NDXRemoved,
        q,
      ),
      save: (basePath) => {
        // always vacuum pending removals before serializing
        maybeClean(true);
        const obj = toSerializable(retVal.index);
        const objStr = JSON.stringify(obj);
        const path = getNDXPath(basePath);
        Fs.writeFileSync(
          path,
          objStr
        );
        console.log("Write NDX to ", path);
      },
      load: newIndex => {
        retVal.index = newIndex;
      }
    };
  } else {
    // Even older code (from older but, to me, much better, version: 0.4.1)
    // NOTE(review): DocumentIndex is not imported in this file (its import is commented out), so
    // this branch cannot run as is; it is unreachable while NDX_OLD is false.
    const index = new DocumentIndex();
    fields.forEach(name => index.addField(name));

    retVal = {
      index,
      search: query => retVal.index.search(query),
      add: doc => retVal.index.add(doc.id, doc),
    };
  }

  DEBUG && console.log('ndx setup', {retVal});

  return retVal;

  // Vacuums the index when forced (`doIt`) and removals are pending, or when the number of pending
  // removals has reached REMOVED_CAP_TO_VACUUM_NDX.
  function maybeClean(doIt = false) {
    if ( (doIt && NDXRemoved.size) || NDXRemoved.size >= REMOVED_CAP_TO_VACUUM_NDX ) {
      vacuumIndex(retVal.index, NDXRemoved);
    }
  }
}
// Loads the serialized NDX index from disk into `ndxFTSIndex` (via its load method) if the index
// file exists; otherwise the index is left as it is. Always logs 'NDX loaded'.
function loadNDXIndex(ndxFTSIndex) {
  const haveSavedIndex = Fs.existsSync(getNDXPath());
  if ( haveSavedIndex ) {
    const serialized = JSON.parse(Fs.readFileSync(getNDXPath()).toString());
    ndxFTSIndex.load(fromSerializable(serialized));
  }
  console.log('NDX loaded');
}
// Builds the document record that is handed to the search indexes.
// `id` is kept as given, while a fresh ndx_id is taken from the NDXId counter on every call.
// i_url is the word-split form of the url; the page text is stored as `content`.
function toNDXDoc({id, url, title, pageText}) {
  const ndx_id = NDXId++;
  const i_url = getURI(url);
  return {id, ndx_id, url, i_url, title, content: pageText};
}
// Lists the document fields that are indexed for search.
// By default (and with NDX_OLD false) the fields are returned as {name} objects, the form used by
// NDXIndex; with `namesOnly` (or NDX_OLD true) as plain strings.
// Note that the special indexable url field `i_url` is indexed, not the regular `url` field.
function ndxDocFields({namesOnly:namesOnly = false} = {}) {
  const names = ["i_url", "title", "content"];
  const asObjects = !namesOnly && !NDX_OLD;
  return asObjects ? names.map(name => ({name})) : names;
}
// Resolves as soon as `thing` has `key`, polling every CHECK_INTERVAL ms until it does.
//   Map          -> resolves with the value stored under key
//   Set          -> resolves with true
//   plain object -> resolves with true once thing[key] is truthy
// Anything else is rejected with a TypeError.
// There is no timeout: if the key never appears, the returned promise never settles.
//
// Changes: the three copies of the polling loop are folded into one, and the TypeError message
// now names the type of the argument (it used to interpolate the value itself).
async function untilHas(thing, key) {
  // `probe` answers "is it there yet?" — null if not, otherwise a box with the value to resolve with
  let probe;
  if ( thing instanceof Map ) {
    probe = () => thing.has(key) ? {value: thing.get(key)} : null;
  } else if ( thing instanceof Set ) {
    probe = () => thing.has(key) ? {value: true} : null;
  } else if ( typeof thing === "object" ) {
    probe = () => thing[key] ? {value: true} : null;
  } else {
    throw new TypeError(`untilHas with thing of type ${typeof thing} is not yet implemented!`);
  }

  const immediate = probe();
  if ( immediate ) return immediate.value;

  return new Promise(resolve => {
    const checker = setInterval(() => {
      const found = probe();
      if ( found ) {
        clearInterval(checker);
        resolve(found.value);
      } else {
        DEBUG && console.log(thing, "not have", key);
      }
    }, CHECK_INTERVAL);
  });
}
// Path of the serialized NDX index file ('index.ndx') inside the NDX index directory for
// `basePath`.
function getNDXPath(basePath) {
  const dir = args.ndx_fts_index_dir(basePath);
  return Path.resolve(dir, 'index.ndx');
}
// Path of the saved fuzzy-search documents file ('docs.fzz') inside the fuzzy index directory for
// `basePath`.
function getFuzzyPath(basePath) {
  const dir = args.fuzzy_fts_index_dir(basePath);
  return Path.resolve(dir, 'docs.fzz');
}
// Directory that holds the exported FlexSearch index files for `basePath`.
function getFlexBase(basePath) {
  const flexDir = args.flex_fts_index_dir(basePath);
  return flexDir;
}