diff --git a/.date.npm.release b/.date.npm.release deleted file mode 100644 index a2b5740..0000000 --- a/.date.npm.release +++ /dev/null @@ -1 +0,0 @@ -Sun Jan 2 05:40:42 AM HKT 2022 diff --git a/.eslintrc.cjs b/.eslintrc.cjs deleted file mode 100644 index ca9fa77..0000000 --- a/.eslintrc.cjs +++ /dev/null @@ -1,16 +0,0 @@ -module.exports = { - "env": { - "es2021": true, - "node": true - }, - "extends": "eslint:recommended", - "parserOptions": { - "ecmaVersion": 13, - "sourceType": "module" - }, - "ignorePatterns": [ - "build/**/*.js" - ], - "rules": { - } -}; diff --git a/.no-release b/.no-release deleted file mode 100644 index 1f970dd..0000000 --- a/.no-release +++ /dev/null @@ -1 +0,0 @@ -Wed Jan 5 02:35:00 PM HKT 2022 diff --git a/.npmignore b/.npmignore deleted file mode 100644 index 69422de..0000000 --- a/.npmignore +++ /dev/null @@ -1,5 +0,0 @@ - -.*.swp - -# Bundling and packaging -bin/* diff --git a/.package.build.json b/.package.build.json deleted file mode 100644 index 17c58b0..0000000 --- a/.package.build.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "name": "diskernet", - "version": "2.6.0", - "description": "Library server and an archivist browser controller.", - "main": "build/22120.js", - "module": "src/app.js", - "bin": { - "diskernet": "build/22120.js" - }, - "scripts": { - "bundle": "npx rollup --config", - "start": "node src/app.js", - "setup": "bash ./scripts/build_setup.sh", - "build": "echo 'please run ./scripts/build_setup.sh first (one time only) and then ./scripts/compile.sh'", - "clean": "rm -rf build/* bin/*", - "test": "nodemon src/app.js", - "save": "nodemon src/app.js 22120 save", - "serve": "nodemon src/app.js 22120 serve", - "lint": "watch -n 5 npx eslint .", - "test-hl": "node src/highlighter.js", - "postpublish": "cp package.json .package.build.json", - "prepublishOnly": "npm run bundle && npx webpack ." - }, - "repository": { - "type": "git", - "url": "git+https://github.com/dosyago/22120.git" - }, - "pkg": { - "patches": { - "./node_modules/fetch-blob/streams.cjs": [ - "Object.assign(globalThis, require('node:stream/web'))", - "Object.assign(globalThis, require('stream').web)" - ] - }, - "scripts": "build/*.js", - "assets": "public/**/*", - "outputPath": "bin" - }, - "keywords": [ - "web-archive", - "search-engine", - "self-hosted", - "offline", - "archivist", - "library" - ], - "author": "@dosy", - "license": "AGPL-3.0", - "bugs": { - "url": "https://github.com/dosyago/22120/issues" - }, - "homepage": "https://github.com/dosyago/22120#readme", - "dependencies": { - "chrome-launcher": "latest", - "express": "latest", - "flexsearch": "^0.7.21", - "fz-search": "^1.0.0", - "hasha": "latest", - "natural": "^5.1.11", - "ndx": "^1.0.2", - "ndx-query": "^1.0.1", - "ndx-serializable": "^1.0.0", - "node-fetch": "latest", - "ukkonen": "^1.4.0", - "ws": "latest" - }, - "devDependencies": { - "eslint": "^8.4.1", - "esm": "^3.2.25", - "nexe": "^1.1.6", - "nodemon": "latest", - "npx": "^3.0.0", - "webpack": "latest", - "webpack-cli": "latest", - "rollup-plugin-terser": "^7.0.2" - } -} diff --git a/.package.dev.json b/.package.dev.json deleted file mode 100644 index ec153cb..0000000 --- a/.package.dev.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "name": "diskernet", - "version": "2.4.11", - "type": "module", - "description": "Library server and an archivist browser controller.", - "main": "src/app.js", - "module": "dist/22120-module.js", - "bin": { - "diskernet": "build/22120.js" - }, - "scripts": { - "bundle": "npx rollup --config", - "start": "node src/app.js", - "setup": "bash ./scripts/build_setup.sh", - "build": "echo 'please run ./scripts/build_setup.sh first (one time only) and then ./scripts/compile.sh'", - "clean": "rm -rf build/* bin/*", - "test": "nodemon src/app.js", - "save": "nodemon src/app.js 22120 save", - "serve": "nodemon src/app.js 22120 serve", - "lint": "watch -n 5 npx eslint .", - "test-hl": "node src/highlighter.js", - "postpublish": "cp package.json .package.build.json", - "prepublishOnly": "npm run bundle && npx webpack ." - }, - "repository": { - "type": "git", - "url": "git+https://github.com/dosyago/22120.git" - }, - "pkg": { - "scripts": "build/*.js", - "assets": "public/**/*", - "outputPath": "bin" - }, - "keywords": [ - "archivist", - "library" - ], - "author": "@dosy", - "license": "AGPL-3.0", - "bugs": { - "url": "https://github.com/dosyago/22120/issues" - }, - "homepage": "https://github.com/dosyago/22120#readme", - "dependencies": { - "chrome-launcher": "latest", - "express": "latest", - "flexsearch": "^0.7.21", - "fz-search": "^1.0.0", - "hasha": "latest", - "natural": "^5.1.11", - "ndx": "^1.0.2", - "ndx-query": "^1.0.1", - "ndx-serializable": "^1.0.0", - "node-fetch": "latest", - "ukkonen": "^1.4.0", - "ws": "latest" - }, - "devDependencies": { - "eslint": "^8.4.1", - "esm": "^3.2.25", - "nexe": "^1.1.6", - "nodemon": "latest", - "npx": "^3.0.0", - "webpack": "latest", - "webpack-cli": "latest" - } -} diff --git a/src/.common.build.js b/src/.common.build.js deleted file mode 100644 index 8afa646..0000000 --- a/src/.common.build.js +++ /dev/null @@ -1,54 +0,0 @@ -import path from 'path'; -//import {fileURLToPath} from 'url'; - -export const DEBUG = process.env.DEBUG_22120 || false; -export const SHOW_FETCH = false; - -export const CHECK_INTERVAL = 400; -export const TEXT_NODE = 3; -export const MAX_HIGHLIGHTABLE_LENGTH = 0; /* 0 is no max length for highlight */ -export const MAX_TITLE_LENGTH = 140; -export const MAX_URL_LENGTH = 140; -export const MAX_HEAD = 140; - -/* text nodes inside these elements that are ignored */ -export const FORBIDDEN_TEXT_PARENT = new Set([ - 'STYLE', - 'SCRIPT', - 'NOSCRIPT', - /* we could remove these last two so as to index them as well */ - 'DATALIST', - 'OPTION' -]); -export const ERROR_CODE_SAFE_TO_IGNORE = new Set([ - -32000, /* message: - Can only get response body on requests captured after headers received. - * ignore because: - seems to only happen when new navigation aborts all - pending requests of the unloading page - */ - -32602, /* message: - Invalid InterceptionId. - * ignore because: - seems to only happen when new navigation aborts all - pending requests of the unloading page - */ -]); - -export const SNIP_CONTEXT = 31; - -export const NO_SANDBOX = process.env.DEBUG_22120 || false; - -//export const APP_ROOT = '.'; -export const APP_ROOT = __dirname; -//export const APP_ROOT = path.dirname(fileURLToPath(import.meta.url)); - -export const sleep = ms => new Promise(res => setTimeout(res, ms)); - -export function say(o) { - console.log(JSON.stringify(o)); -} - -export function clone(o) { - return JSON.parse(JSON.stringify(o)); -} diff --git a/src/.common.dev.js b/src/.common.dev.js deleted file mode 100644 index ce7e0e8..0000000 --- a/src/.common.dev.js +++ /dev/null @@ -1,54 +0,0 @@ -import path from 'path'; -import {fileURLToPath} from 'url'; - -export const DEBUG = process.env.DEBUG_22120 || false; -export const SHOW_FETCH = false; - -export const CHECK_INTERVAL = 400; -export const TEXT_NODE = 3; -export const MAX_HIGHLIGHTABLE_LENGTH = 0; /* 0 is no max length for highlight */ -export const MAX_TITLE_LENGTH = 140; -export const MAX_URL_LENGTH = 140; -export const MAX_HEAD = 140; - -/* text nodes inside these elements that are ignored */ -export const FORBIDDEN_TEXT_PARENT = new Set([ - 'STYLE', - 'SCRIPT', - 'NOSCRIPT', - /* we could remove these last two so as to index them as well */ - 'DATALIST', - 'OPTION' -]); -export const ERROR_CODE_SAFE_TO_IGNORE = new Set([ - -32000, /* message: - Can only get response body on requests captured after headers received. - * ignore because: - seems to only happen when new navigation aborts all - pending requests of the unloading page - */ - -32602, /* message: - Invalid InterceptionId. - * ignore because: - seems to only happen when new navigation aborts all - pending requests of the unloading page - */ -]); - -export const SNIP_CONTEXT = 31; - -export const NO_SANDBOX = process.env.DEBUG_22120 || false; - -//export const APP_ROOT = '.'; -//export const APP_ROOT = __dirname; -export const APP_ROOT = path.dirname(fileURLToPath(import.meta.url)); - -export const sleep = ms => new Promise(res => setTimeout(res, ms)); - -export function say(o) { - console.log(JSON.stringify(o)); -} - -export function clone(o) { - return JSON.parse(JSON.stringify(o)); -} diff --git a/src/common.js b/src/common.js index ce7e0e8..b6ae2a2 100644 --- a/src/common.js +++ b/src/common.js @@ -1,9 +1,25 @@ import path from 'path'; import {fileURLToPath} from 'url'; +import fs from 'fs'; +import os from 'os'; -export const DEBUG = process.env.DEBUG_22120 || false; +export const DEBUG = { + debug: process.env.DEBUG_22120 || false, + checkPred: true +} export const SHOW_FETCH = false; +// server related +export const PUBLIC_SERVER = true; + +// crawl related +export const MIN_TIME_PER_PAGE = 10000; +export const MAX_TIME_PER_PAGE = 32000; +export const MIN_WAIT = 200; +export const MAX_WAITS = 300; +export const BATCH_SIZE = 5; // crawl batch size (how many concurrent tabs for crawling) +export const MAX_REAL_URL_LENGTH = 2**15 - 1; + export const CHECK_INTERVAL = 400; export const TEXT_NODE = 3; export const MAX_HIGHLIGHTABLE_LENGTH = 0; /* 0 is no max length for highlight */ @@ -11,6 +27,21 @@ export const MAX_TITLE_LENGTH = 140; export const MAX_URL_LENGTH = 140; export const MAX_HEAD = 140; +export const GO_SECURE = fs.existsSync(path.resolve(os.homedir(), 'local-sslcerts', 'privkey.pem')); + +export class RichError extends Error { + constructor(msg) { + let textMessage; + try { + textMessage = JSON.stringify(msg); + } catch(e) { + console.warn(`Could not create RichError from argument ${msg.toString ? msg.toString() : msg} as JSON serialization failed. RichError argument MUST be JSON serializable. Failure error was:`, e); + return; + } + super(textMessage); + } +} + /* text nodes inside these elements that are ignored */ export const FORBIDDEN_TEXT_PARENT = new Set([ 'STYLE', @@ -37,7 +68,7 @@ export const ERROR_CODE_SAFE_TO_IGNORE = new Set([ export const SNIP_CONTEXT = 31; -export const NO_SANDBOX = process.env.DEBUG_22120 || false; +export const NO_SANDBOX = (process.env.DEBUG_22120 && process.env.SET_22120_NO_SANDBOX) || false; //export const APP_ROOT = '.'; //export const APP_ROOT = __dirname; @@ -52,3 +83,29 @@ export function say(o) { export function clone(o) { return JSON.parse(JSON.stringify(o)); } + +export async function untilTrue(pred, waitOverride = MIN_WAIT, maxWaits = MAX_WAITS) { + if ( waitOverride < 0 ) { + maxWaits = -1; + waitOverride = MIN_WAIT; + } + let waitCount = 0; + let resolve; + const pr = new Promise(res => resolve = res); + setTimeout(checkPred, 0); + return pr; + + async function checkPred() { + DEBUG.checkPred && console.log('Checking', pred.toString()); + if ( await pred() ) { + return resolve(true); + } else { + waitCount++; + if ( waitCount < maxWaits || maxWaits < 0 ) { + setTimeout(checkPred, waitOverride); + } else { + resolve(false); + } + } + } +} diff --git a/src/protocol.js b/src/protocol.js index 0134b6b..f3ff893 100644 --- a/src/protocol.js +++ b/src/protocol.js @@ -1,6 +1,6 @@ import Ws from 'ws'; import Fetch from 'node-fetch'; -import {SHOW_FETCH, DEBUG, ERROR_CODE_SAFE_TO_IGNORE} from './common.js'; +import {untilTrue, SHOW_FETCH, DEBUG, ERROR_CODE_SAFE_TO_IGNORE} from './common.js'; const ROOT_SESSION = "browser"; const MESSAGES = new Map(); @@ -8,7 +8,18 @@ const MESSAGES = new Map(); export async function connect({port:port = 9222} = {}) { let webSocketDebuggerUrl, socket; try { - ({webSocketDebuggerUrl} = await Fetch(`http://localhost:${port}/json/version`).then(r => r.json())); + await untilTrue(async () => { + let result = false; + try { + const {webSocketDebuggerUrl} = await Fetch(`http://127.0.0.1:${port}/json/version`).then(r => r.json()); + if ( webSocketDebuggerUrl ) { + result = true; + } + } finally { + return result; + } + }); + ({webSocketDebuggerUrl} = await Fetch(`http://127.0.0.1:${port}/json/version`).then(r => r.json())); socket = new Ws(webSocketDebuggerUrl); } catch(e) { console.log("Error communicating with browser", e);