Diskernet/archivist.js

588 lines
17 KiB
JavaScript

import hasha from 'hasha';
import {URL} from 'url';
import path from 'path';
import fs from 'fs';
import FlexSearch from 'flexsearch';
import args from './args.js';
import {
APP_ROOT, context, sleep, DEBUG,
CHECK_INTERVAL, TEXT_NODE, FORBIDDEN_TEXT_PARENT
} from './common.js';
import {connect} from './protocol.js';
import {getInjection} from './public/injection.js';
import {BLOCKED_BODY, BLOCKED_CODE, BLOCKED_HEADERS} from './blockedResponse.js';
//import xapian from 'xapian';
// cache is a simple map
// that holds the serialized requests
// that are saved on disk
let Fs, Mode, Close;
const {Index, registerCharset, registerLanguage} = FlexSearch;
const FLEX_OPTS = {
context: true,
};
const Flex = new Index(FLEX_OPTS);
const Cache = new Map();
const Indexing = new Set();
const State = {
Indexing,
Cache,
SavedCacheFilePath: null,
SavedIndexFilePath: null,
saver: null,
indexSaver: null
}
const IGNORE_NODES = new Set([
'script',
'style',
'noscript',
'datalist'
]);
const TextNode = 3;
const AttributeNode = 2;
const Archivist = {
collect, getMode, changeMode, shutdown, handlePathChanged
}
const BODYLESS = new Set([
301,
302,
303,
307
]);
const NEVER_CACHE = new Set([
`http://localhost:${args.server_port}`,
`http://localhost:${args.chrome_port}`
]);
const SORT_URLS = ([urlA],[urlB]) => urlA < urlB ? -1 : 1;
const CACHE_FILE = args.cache_file;
const INDEX_FILE = args.index_file;
const NO_FILE = args.no_file;
const TBL = /:\/\//g;
const HASH_OPTS = {algorithm: 'sha1'};
const UNCACHED_BODY = b64('We have not saved this data');
const UNCACHED_CODE = 404;
const UNCACHED_HEADERS = [
{ name: 'Content-type', value: 'text/plain' },
{ name: 'Content-length', value: '26' }
];
const UNCACHED = {
body:UNCACHED_BODY, responseCode:UNCACHED_CODE, responseHeaders:UNCACHED_HEADERS
}
export default Archivist;
async function collect({chrome_port:port, mode} = {}) {
if ( context == 'node' ) {
const {default:fs} = await import('fs');
Fs = fs;
}
const {library_path} = args;
const {send, on, close} = await connect({port});
const Sessions = new Map();
const Installations = new Set();
const ConfirmedInstalls = new Set();
const DELAY = 100; // 500 ?
Close = close;
Mode = mode;
let requestStage;
loadFiles();
clearSavers();
if ( Mode == 'save' ) {
requestStage = "Response";
// in case we get a updateBasePath call before an interval
// and we don't clear it in time, leading us to erroneously save the old
// cache to the new path, we always used our saved copy
State.saver = setInterval(() => saveCache(State.SavedCacheFilePath), 10000);
State.indexSaver = setInterval(() => saveIndex(State.SavedIndexFilePath), 10001);
} else if ( Mode == 'serve' ) {
requestStage = "Request";
} else {
throw new TypeError(`Must specify mode`);
}
on("Target.targetInfoChanged", indexURL);
on("Target.targetInfoChanged", reloadIfNotLive);
on("Target.targetInfoChanged", attachToTarget);
on("Target.attachedToTarget", installForSession);
on("Fetch.requestPaused", cacheRequest);
on("Runtime.consoleAPICalled", confirmInstall);
await send("Fetch.enable", {
patterns: [
{
urlPattern: "http*://*",
requestStage
}
],
});
await send("Target.setDiscoverTargets", {discover:true});
await send("Target.setAutoAttach", {autoAttach:true, waitForDebuggerOnStart:false, flatten: true});
const {targetInfos:targets} = await send("Target.getTargets", {});
const pageTargets = targets.filter(({type}) => type == 'page');
pageTargets.forEach(attachToTarget);
function guard(func, text = '') {
return (...args) => {
//DEBUG && console.log({text, func:func.name, args:JSON.stringify(args,null,2)});
return func(...args);
};
}
function confirmInstall(args) {
const {type, args:[{value:strVal}], context} = args;
if ( type == 'info' ) {
try {
const val = JSON.parse(strVal);
const {installed:{sessionId}} = val;
if ( ! ConfirmedInstalls.has(sessionId) ) {
ConfirmedInstalls.add(sessionId);
console.log({confirmedInstall:val, context});
}
} finally {}
}
}
async function reloadIfNotLive({targetInfo}) {
if ( Mode == 'serve' ) return;
const {attached, type} = targetInfo;
if ( attached && type == 'page' ) {
const {url, targetId} = targetInfo;
const sessionId = Sessions.get(targetId);
if ( !!sessionId && !!url && url != "about:blank" && !url.startsWith('chrome') && !ConfirmedInstalls.has(sessionId) ) {
console.log({reloadingAsNotConfirmedInstalled:{url, sessionId}});
send("Page.stopLoading", {}, sessionId);
send("Page.reload", {}, sessionId);
}
}
}
async function installForSession({sessionId, targetInfo, waitingForDebugger}) {
console.log("installForSession called");
if ( ! sessionId ) {
throw new TypeError(`installForSession needs a sessionId`);
}
const {targetId, url} = targetInfo;
if ( Installations.has(sessionId) ) return;
if ( targetInfo.type != 'page' ) return;
if ( Mode == 'serve' ) return;
Sessions.set(targetId, sessionId);
if ( Mode == 'save' ) {
send("Network.setCacheDisabled", {cacheDisabled:true}, sessionId);
send("Network.setBypassServiceWorker", {bypass:true}, sessionId);
await send("Runtime.enable", {}, sessionId);
await send("Page.enable", {}, sessionId);
await send("DOMSnapshot.enable", {}, sessionId);
await send("Page.addScriptToEvaluateOnNewDocument", {
source: getInjection({sessionId}),
worldName: "Context-22120-Indexing"
}, sessionId);
DEBUG && console.log("Just request install", targetId, url);
}
Installations.add(sessionId);
console.log('Installed sessionId', sessionId);
indexURL({targetInfo});
}
async function indexURL({targetInfo:info = {}, sessionId, waitingForDebugger} = {}) {
if ( Mode == 'serve' ) return;
if ( info.type != 'page' ) return;
if ( ! info.url || info.url == 'about:blank' ) return;
if ( info.url.startsWith('chrome') ) return;
if ( dontCache(info) ) return;
if ( State.Indexing.has(info.targetId) ) return;
State.Indexing.add(info.targetId);
State.Index.set(info.url, info.title);
if ( ! sessionId ) {
sessionId = await untilHas(Sessions, info.targetId);
}
if ( !Installations.has(sessionId) ) {
await untilHas(Installations, sessionId);
}
console.log('hi', sessionId);
send("DOMSnapshot.enable", {}, sessionId);
await sleep(500);
const flatDoc = await send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
}, sessionId);
const pageText = processDoc(flatDoc);
//Flex.updateAsync(info.url, pageText).then(r => console.log('Search index update done'));
//Flex.addAsync(info.url, pageText).then(r => console.log('Search index update done'));
const res = Flex.add(info.url, pageText);
DEBUG && console.log('Flex Index Result>>>', res);
State.Indexing.delete(info.targetId);
console.log(`Indexed ${info.url} to ${info.title}`);
}
async function untilHas(thing, key) {
if ( thing instanceof Map ) {
if ( thing.has(key) ) {
return thing.get(key);
} else {
let resolve;
const pr = new Promise(res => resolve = res);
const checker = setInterval(() => {
if ( thing.has(key) ) {
clearInterval(checker);
resolve(thing.get(key));
} else {
console.log(thing, "not have", key);
}
}, CHECK_INTERVAL);
return pr;
}
} else if ( thing instanceof Set ) {
if ( thing.has(key) ) {
return true;
} else {
let resolve;
const pr = new Promise(res => resolve = res);
const checker = setInterval(() => {
if ( thing.has(key) ) {
clearInterval(checker);
resolve(true);
} else {
console.log(thing, "not have", key);
}
}, CHECK_INTERVAL);
return pr;
}
} else {
throw new TypeError(`untilHas with thing of type ${thing} is not yet implemented!`);
}
}
function processDoc({documents, strings}) {
/*
Info
Implementation Notes
1. Code uses spec at:
https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#type-NodeTreeSnapshot
2. Note that so far the below will NOT produce text for and therefore we will NOT
index textarea or input elements. We can access those by using the textValue and
inputValue array properties of the doc, if we want to implement that.
*/
const texts = [];
for( const doc of documents) {
const textIndices = doc.nodes.nodeType.reduce((Indices, type, index) => {
if ( type === TEXT_NODE ) {
const parentIndex = doc.nodes.parentIndex[index];
const forbiddenParent = parentIndex >= 0 &&
FORBIDDEN_TEXT_PARENT.has(strings[
doc.nodes.nodeName[
parentIndex
]
])
if ( ! forbiddenParent ) {
Indices.push(index);
}
}
return Indices;
}, []);
textIndices.forEach(index => {
const stringsIndex = doc.nodes.nodeValue[index];
const text = strings[stringsIndex];
texts.push(text);
});
}
const pageText = texts.filter(t => t.trim()).join(' ');
DEBUG && console.log('Page text>>>', pageText);
return pageText;
}
async function attachToTarget(targetInfo) {
if ( dontCache(targetInfo) ) return;
const {url} = targetInfo;
if ( !!url && url != "about:blank" && !url.startsWith('chrome') ) {
if ( targetInfo.type == 'page' ) {
if ( ! targetInfo.attached ) {
const {sessionId} = await send("Target.attachToTarget", {
targetId: targetInfo.targetId,
flatten: true
});
Sessions.set(targetInfo.targetId, sessionId);
}
}
}
}
async function cacheRequest(pausedRequest) {
const {
requestId, request, resourceType,
responseStatusCode, responseHeaders, responseErrorReason
} = pausedRequest;
const {url} = request;
const isNavigationRequest = resourceType == "Document";
const isFont = resourceType == "Font";
if ( dontCache(request) ) {
DEBUG && console.log("Not caching", request.url);
return send("Fetch.continueRequest", {requestId});
}
const key = serializeRequest(request);
if ( Mode == 'serve' ) {
if ( State.Cache.has(key) ) {
let {body, responseCode, responseHeaders} = await getResponseData(State.Cache.get(key));
responseCode = responseCode || 200;
//DEBUG && console.log("Fulfilling", key, responseCode, responseHeaders, body.slice(0,140));
DEBUG && console.log("Fulfilling", key, responseCode, body.slice(0,140));
await send("Fetch.fulfillRequest", {
requestId, body, responseCode, responseHeaders
});
} else {
DEBUG && console.log("Sending cache stub", key);
await send("Fetch.fulfillRequest", {
requestId, ...UNCACHED
});
}
} else if ( Mode == 'save' ) {
const response = {key, responseCode: responseStatusCode, responseHeaders};
const resp = await getBody({requestId, responseStatusCode});
if ( !! resp ) {
let {body, base64Encoded} = resp;
if ( ! base64Encoded ) {
body = b64(body);
}
response.body = body;
const responsePath = await saveResponseData(key, request.url, response);
State.Cache.set(key, responsePath);
} else {
DEBUG && console.warn("get response body error", key, responseStatusCode, responseHeaders, pausedRequest.responseErrorReason);
response.body = '';
}
await sleep(DELAY);
if ( !isFont && responseErrorReason ) {
if ( isNavigationRequest ) {
await send("Fetch.fulfillRequest", {
requestId,
responseHeaders: BLOCKED_HEADERS,
responseCode: BLOCKED_CODE,
body: Buffer.from(responseErrorReason).toString("base64"),
},
);
} else {
await send("Fetch.failRequest", {
requestId,
errorReason: responseErrorReason
},
);
}
} else {
try {
await send("Fetch.continueRequest", {
requestId,
},
);
} catch(e) {
console.warn("Issue with continuing request", e, message);
}
}
}
}
async function getBody({requestId, responseStatusCode}) {
let resp;
if ( ! BODYLESS.has(responseStatusCode) ) {
resp = await send("Fetch.getResponseBody", {requestId});
} else {
resp = {body:'', base64Encoded:true};
}
return resp;
}
function dontCache(request) {
if ( ! request.url ) return false;
const url = new URL(request.url);
return NEVER_CACHE.has(url.origin) || (State.No && State.No.test(url.host));
}
async function getResponseData(path) {
try {
return JSON.parse(await Fs.promises.readFile(path));
} catch(e) {
console.warn(`Error with ${path}`, e);
return UNCACHED;
}
}
async function saveResponseData(key, url, response) {
const origin = (new URL(url).origin);
let originDir = State.Cache.get(origin);
if ( ! originDir ) {
originDir = path.resolve(library_path(), origin.replace(TBL, '_'));
try {
await Fs.promises.mkdir(originDir, {recursive:true});
} catch(e) {
console.warn(`Issue with origin directory ${path.dirname(responsePath)}`, e);
}
State.Cache.set(origin, originDir);
}
const fileName = `${await hasha(key, HASH_OPTS)}.json`;
const responsePath = path.resolve(originDir, fileName);
await Fs.promises.writeFile(responsePath, JSON.stringify(response,null,2));
return responsePath;
}
function serializeRequest(request) {
const {url, urlFragment, method, headers, postData, hasPostData} = request;
/**
let sortedHeaders = '';
for( const key of Object.keys(headers).sort() ) {
sortedHeaders += `${key}:${headers[key]}/`;
}
**/
return `${method}${url}`;
//return `${url}${urlFragment}:${method}:${sortedHeaders}:${postData}:${hasPostData}`;
}
}
function clearSavers() {
if ( State.saver ) {
clearInterval(State.saver);
State.saver = null;
}
if ( State.indexSaver ) {
clearInterval(State.indexSaver);
State.indexSaver = null;
}
}
function loadFiles() {
try {
State.Cache = new Map(JSON.parse(Fs.readFileSync(CACHE_FILE())));
State.Index = new Map(JSON.parse(Fs.readFileSync(INDEX_FILE())));
State.SavedCacheFilePath = CACHE_FILE();
State.SavedIndexFilePath = INDEX_FILE();
DEBUG && console.log(`Loaded cache key file ${CACHE_FILE()}`);
DEBUG && console.log(`Loaded index file ${INDEX_FILE()}`);
} catch(e) {
DEBUG && console.warn('Error reading file', e);
State.Cache = new Map();
State.Index = new Map();
}
try {
if ( !Fs.existsSync(NO_FILE()) ) {
DEBUG && console.log(`The 'No file' (${NO_FILE()}) does not exist, ignoring...`);
State.No = null;
} else {
State.No = new RegExp(JSON.parse(Fs.readFileSync(NO_FILE))
.join('|')
.replace(/\./g, '\\.')
.replace(/\*/g, '.*')
.replace(/\?/g, '.?')
);
}
} catch(e) {
DEBUG && console.warn('Error compiling regex from No file', e);
State.No = null;
}
}
function getMode() { return Mode; }
async function changeMode(mode) {
DEBUG && console.log({modeChange:mode});
clearSavers();
saveCache();
saveIndex();
Close && Close();
Mode = mode;
await collect({chrome_port:args.chrome_port, mode});
}
function handlePathChanged() {
DEBUG && console.log({libraryPathChange:args.library_path()});
clearSavers();
// saves the old cache path
saveCache(State.SavedCacheFilePath);
saveIndex(State.SavedIndexFilePath);
// reloads from new path and updates Saved FilePaths
loadFiles();
}
function saveCache(path) {
if ( context == 'node' ) {
//DEBUG && console.log("Writing to", path || CACHE_FILE());
Fs.writeFileSync(path || CACHE_FILE(), JSON.stringify([...State.Cache.entries()],null,2));
}
}
function saveIndex(path) {
if ( context == 'node' ) {
//DEBUG && console.log("Writing to", path || INDEX_FILE());
//DEBUG && console.log([...State.Index.entries()].sort(SORT_URLS));
Fs.writeFileSync(
path || INDEX_FILE(),
JSON.stringify([...State.Index.entries()].sort(SORT_URLS),null,2)
);
}
}
function shutdown() {
DEBUG && console.log(`Archivist shutting down...`);
saveCache();
Close && Close();
DEBUG && console.log(`Archivist shut down.`);
}
function b64(s) {
if ( context == 'node' ) {
return Buffer.from(s).toString('base64');
} else {
return btoa(s);
}
}