commit bb560f0853: Delete some
Cris Stringfellow, 2023-01-14 16:26:19 +08:00
24 changed files with 5784 additions and 229 deletions

@@ -1 +0,0 @@
Sun Jan 2 05:40:42 AM HKT 2022

@@ -1 +0,0 @@
Wed Jan 5 02:35:00 PM HKT 2022

package-lock.json (generated, 2754 lines): diff suppressed because it is too large.

package.json
@@ -17,6 +17,7 @@
"clean": "rm -rf build/* bin/*",
"super-clean": "npm run clean || : && rm -rf node_modules || : && rm package-lock.json",
"test": "nodemon src/app.js",
"inspect": "node --inspect-brk=127.0.0.1:9999 src/app.js",
"save": "nodemon src/app.js DiskerNet save",
"serve": "nodemon src/app.js DiskerNet serve",
"lint": "watch -n 5 npx eslint .",

@@ -0,0 +1,133 @@
#!/usr/bin/env node
import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';
import {
loadPref,
cache_file,
index_file,
} from '../src/args.js';
const CLEAN = true;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const problems = new Map();
let cleaning = false;
let made = false;
process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);
console.log({Pref:loadPref(), cache_file: cache_file(), index_file: index_file()});
make();
async function make() {
const indexFile = fs.readFileSync(index_file()).toString();
JSON.parse(indexFile).map(([key, value]) => {
if ( typeof key === "number" ) return;
if ( key.startsWith('ndx') ) return;
if ( value.title === undefined ) {
console.log('no title property', {key, value});
return;
}
const url = key;
const title = value.title.toLocaleLowerCase();
if ( title.length === 0 || title.includes('404') || title.includes('not found') ) {
if ( problems.has(url) ) {
console.log('Found duplicate', url, title, problems.get(url));
}
const prob = {title, dupes:[], dupe:false};
problems.set(url, prob);
const cleaned1 = clean(url);
if ( problems.has(cleaned1) ) {
console.log(`Found duplicate`, {url, title, cleaned1, dupeEntry:problems.get(cleaned1)});
prob.dupe = true;
prob.dupes.push(cleaned1);
url !== cleaned1 && (problems.delete(cleaned1), prob.diff = true);
}
const cleaned2 = clean2(url);
if ( problems.has(cleaned2) ) {
console.log(`Found duplicate`, {url, title, cleaned2, dupeEntry: problems.get(cleaned2)});
prob.dupe = true;
prob.dupes.push(cleaned2);
url !== cleaned2 && (problems.delete(cleaned2), prob.diff = true);
}
}
});
made = true;
cleanup();
}
function cleanup() {
if ( cleaning ) return;
if ( ! made ) return;
cleaning = true;
console.log('cleanup running');
const outData = [...problems.entries()].filter(([key, {dupe}]) => dupe);
outData.sort(([a], [b]) => a.localeCompare(b));
fs.writeFileSync(
path.resolve('.', 'url-cleaned-dupes.json'),
JSON.stringify(outData, null, 2)
);
const {size:bytesWritten} = fs.statSync(
path.resolve('.', 'url-cleaned-dupes.json'),
{bigint: true}
);
console.log(`Wrote ${outData.length} dupe urls in ${bytesWritten} bytes.`);
process.exit(0);
}
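// clean() below normalizes a URL into a dedupe key. A sketch of its effect on
// an assumed example input (not taken from the index):
//   clean('http://www.example.com/page.html?utm_source=x#top') -> 'https://example.com/page'
// Fragments survive only for hashbang (#!) URLs and for google.com / 80s.nyc
// hosts, where the fragment carries routing information.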
function clean(urlString) {
const url = new URL(urlString);
if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
} else {
url.hash = '';
}
// Snapshot the keys first: deleting from searchParams while iterating it live skips entries.
for ( const key of [...url.searchParams.keys()] ) {
if ( key.startsWith('utm_') ) {
url.searchParams.delete(key);
}
}
url.pathname = url.pathname.replace(/\/$/, '');
url.protocol = 'https:';
url.pathname = url.pathname.replace(/(\.htm.?|\.php|\.asp.?)$/, '');
if ( url.hostname.startsWith('www.') ) {
url.hostname = url.hostname.replace(/^www./, '');
}
const key = url.toString();
return key;
}
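// clean2() is a coarser key: it drops the path entirely, leaving only the
// origin plus any query and fragment, so near-duplicates on one host collide.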
function clean2(urlString) {
const url = new URL(urlString);
url.pathname = '';
return url.toString();
}
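// curlCommand() builds a curl invocation that follows redirects (-L) and prints
// only the final effective URL (-w '%{url_effective}'), sending browser-like
// headers so servers answer as they would to Chrome.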
function curlCommand(url) {
return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'DNT: 1' \
-H 'Pragma: no-cache' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: none' \
-H 'Sec-Fetch-User: ?1' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--compressed ;
`;
}

public/find_crawlable.mjs (new executable file, 92 lines)
@@ -0,0 +1,92 @@
#!/usr/bin/env node
import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';
const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const entries = [];
let cleaning = false;
process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);
make();
async function make() {
const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
titles.forEach(({url,title}) => {
if ( title.length === 0 && url.startsWith('https:') && !url.endsWith('.pdf') ) {
entries.push(url);
}
});
cleanup();
}
function cleanup() {
if ( cleaning ) return;
cleaning = true;
console.log('cleanup running');
fs.writeFileSync(
path.resolve('.', 'recrawl-https-3.json'),
JSON.stringify(entries, null, 2)
);
console.log(`Wrote recrawlable urls`);
process.exit(0);
}
function clean(urlString) {
const url = new URL(urlString);
if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
} else {
url.hash = '';
}
for ( const key of [...url.searchParams.keys()] ) {
if ( key.startsWith('utm_') ) {
url.searchParams.delete(key);
}
}
url.pathname = url.pathname.replace(/\/$/, '');
url.protocol = 'https:';
url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
if ( url.hostname.startsWith('www.') ) {
url.hostname = url.hostname.replace(/^www./, '');
}
const key = url.toString();
return key;
}
function clean2(urlString) {
const url = new URL(urlString);
url.pathname = '';
return url.toString();
}
function curlCommand(url) {
return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'DNT: 1' \
-H 'Pragma: no-cache' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: none' \
-H 'Sec-Fetch-User: ?1' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--compressed ;
`;
}

@@ -8,6 +8,140 @@
<p>
View <a href=/archive_index.html>the index</a>
</p>
<form method=POST action=/crawl>
<fieldset>
<legend>Crawl and Index</legend>
<p>
Crawl and index a list of links.
<br>
<small>This will open 1 link at a time, and index it when it has loaded.</small>
<p>
<label>
Links
<br>
<textarea class=long name=links>
https://cnn.com
https://bloomberg.com
https://microsoft.com
https://dosyago.com
https://intel.com
</textarea>
<br>
<small>List format is 1 link per line.</small>
</label>
</p>
<details open>
<summary>Advanced settings</summary>
<p>
<label>
Timeout
<br>
<input required name=timeout
type=number min=1 max=300 value=3.6 step=0.1> <span class=units>seconds</span>
<br>
<small>Seconds to wait for each page to load before indexing.</small>
</label>
<p>
<label>
Depth
<br>
<input required name=depth
type=number min=1 max=20 value=1 step=1> <span class=units>clicks</span>
</label>
<br>
<section class=small>
<strong>Value guide</strong>
<ol>
<li>Only each link.
<li>Plus anything 1 click from the link.
<li>Plus anything 2 clicks from the link.
</ol>
<em>And so on&hellip;</em>
</section>
<p>
<label>
Min Page Crawl Time
<br>
<input name=minPageCrawlTime
type=number min=1 max=60 value=20> <span class=units>seconds</span>
<br>
<small>Minimum seconds to spend crawling each page.</small>
</label>
<p>
<p>
<label>
Max Page Crawl Time
<br>
<input name=maxPageCrawlTime
type=number min=3 max=120 value=30> <span class=units>seconds</span>
<br>
<small>Max time to allow for each page.</small>
</label>
<p>
<p>
<label>
Batch size
<br>
<input name=batchSize
type=number min=1 max=32 value=2> <span class=units>tabs</span>
<br>
<small>Number of concurrent tabs.</small>
</label>
<p>
<p>
<label>
<input name=saveToFile
type=checkbox checked>
Save the harvested URLs to a file
</label>
<p>
<p>
<label>
<span class=text>Program to run on every page</span>
<br>
<textarea class=long rows=9 name=program>
if ( ! State.titles ) {
State.titles = new Map();
State.onExit.addHandler(() => {
fs.writeFileSync(
path.resolve('.', `titles-${(new Date).toISOString()}.txt`),
JSON.stringify([...State.titles.entries()], null, 2) + '\n'
);
});
}
const {result:{value:data}} = await send("Runtime.evaluate",
{
expression: `(function () {
return {
url: document.location.href,
title: document.title,
};
}())`,
returnByValue: true
},
sessionId
);
State.titles.set(data.url, data.title);
console.log(`Saved ${State.titles.size} titles`);
</textarea>
</label>
</p>
</details>
<p>
<button>Crawl</button>
<script>
{
const button = document.currentScript.previousElementSibling;
let disabled = false;
button.addEventListener('click', click => {
if ( disabled ) return click.preventDefault();
disabled = true;
setTimeout(() => button.disabled = true, 0);
});
}
</script>
</fieldset>
</form>
<form method=GET action=/search>
<fieldset class=search>
<legend>Search your archive</legend>
@@ -47,6 +181,7 @@
async function showCurrentMode() {
const mode = await fetch('/mode').then(r => r.text());
console.log({mode});
form.notification.value = "";
form.querySelector(`[name="mode"][value="${mode}"]`).checked = true;
}

@@ -12,7 +12,8 @@ export function getInjection({sessionId}) {
// in future
return `
{
const DEBUG = ${DEBUG};
const X = 1;
const DEBUG = ${JSON.stringify(DEBUG, null, 2)};
const MIN_CHECK_TEXT = 3000; // min time between checking documentElement.innerText
const MIN_NOTIFY = 5000; // min time between telling controller text maybe changed
const MAX_NOTIFICATIONS = 13; // max times we will tell controller text maybe changed
@@ -99,7 +100,7 @@
count++;
handler({textChange:{source}});
} catch(e) {
DEBUG && console.warn('could not parse message', data, e);
DEBUG.verboseSlow && console.warn('could not parse message', data, e);
}
}
}
@@ -126,7 +127,7 @@
console.log('check');
const textMutated = document.documentElement.innerText !== lastInnerText;
if ( textMutated ) {
DEBUG && console.log('Text changed');
DEBUG.verboseSlow && console.log('Text changed');
lastInnerText = document.documentElement.innerText;
Top.postMessage({frameTextChangeNotification:{source:location.href}}, '*');
}

public/make_top.mjs (new executable file, 250 lines)
@@ -0,0 +1,250 @@
#!/usr/bin/env node
import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';
const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const entries = [];
const counted = new Set();
const errors = new Map();
let counts;
let cleaning = false;
process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);
make();
async function make() {
const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
if ( CLEAN ) {
for ( const [url, obj] of titles ) {
const k1 = clean(url);
const k2 = clean2(url);
if ( !titles.has(k1) ) {
titles.set(k1, obj);
}
if ( !titles.has(k2) ) {
titles.set(k2, obj);
}
}
}
const remainingFile = fs.readFileSync(path.resolve('.', 'remainingFile.json')).toString();
const remainingSet = new Set(JSON.parse(remainingFile));
const countsFile = fs.readFileSync(path.resolve('.', 'ran-counts.json')).toString();
counts = new Map(JSON.parse(countsFile).filter(([url, count]) => remainingSet.has(url)));
let current = 0;
for ( const [url, count] of counts ) {
let title;
let realUrl;
if ( titles.has(url) ) {
({title} = titles.get(url));
entries.push({
url,
title,
count,
});
counted.add(url);
} else {
console.log(`Curl call for ${url} in progress...`);
let notifyCurlComplete;
const curlCall = new Promise(res => notifyCurlComplete = res);
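// Throttle: poll until a slot frees, so at most CONCURRENT curl child
// processes run at once; notifyCurlComplete resolves curlCall when the
// child exits, and the curlCall.then() below releases the slot.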
do {
await sleep(1000);
} while ( current >= CONCURRENT );
child_process.exec(curlCommand(url), (err, stdout, stderr) => {
if ( ! err && (!stderr || stderr.length == 0)) {
realUrl = stdout;
if ( titles.has(realUrl) ) {
({title} = titles.get(realUrl));
entries.push({
url,
realUrl,
title,
count,
});
counted.add(url);
}
} else {
console.log(`Error on curl for ${url}`, {err, stderr});
errors.set(url, {err, stderr});
}
console.log(`Curl call for ${url} complete!`);
notifyCurlComplete();
});
current += 1;
curlCall.then(() => current -= 1);
}
}
cleanup();
}
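// make_v2() and make_v1() below are earlier iterations kept for reference;
// only make() above is invoked.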
async function make_v2() {
const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
if ( CLEAN ) {
for ( const [url, obj] of titles ) {
const k1 = clean(url);
const k2 = clean2(url);
if ( !titles.has(k1) ) {
titles.set(k1, obj);
}
if ( !titles.has(k2) ) {
titles.set(k2, obj);
}
}
}
const countsFile = fs.readFileSync(path.resolve('.', 'ran-counts.json')).toString();
counts = new Map(JSON.parse(countsFile));
let current = 0;
for ( const [url, count] of counts ) {
let title;
let realUrl;
if ( titles.has(url) ) {
({title} = titles.get(url));
entries.push({
url,
title,
count,
});
counted.add(url);
} else {
console.log(`Curl call for ${url} in progress...`);
let notifyCurlComplete;
const curlCall = new Promise(res => notifyCurlComplete = res);
do {
await sleep(250);
} while ( current >= CONCURRENT );
child_process.exec(curlCommand(url), (err, stdout, stderr) => {
if ( ! err && (!stderr || stderr.length == 0)) {
realUrl = stdout;
if ( titles.has(realUrl) ) {
({title} = titles.get(realUrl));
entries.push({
url,
realUrl,
title,
count,
});
counted.add(url);
}
} else {
console.log(`Error on curl for ${url}`, {err, stderr});
errors.set(url, {err, stderr});
}
console.log(`Curl call for ${url} complete!`);
notifyCurlComplete();
});
current += 1;
curlCall.then(() => current -= 1);
}
}
cleanup();
}
function cleanup() {
if ( cleaning ) return;
cleaning = true;
console.log('cleanup running');
if ( errors.size ) {
fs.writeFileSync(
path.resolve('.', 'errorLinks4.json'),
JSON.stringify([...errors.keys()], null, 2)
);
console.log(`Wrote errors`);
}
if ( counted.size !== counts.size ) {
counted.forEach(url => counts.delete(url));
fs.writeFileSync(
path.resolve('.', 'noTitleFound4.json'),
JSON.stringify([...counts.keys()], null, 2)
)
console.log(`Wrote noTitleFound`);
}
fs.writeFileSync(
path.resolve('.', 'topFrontPageLinksWithCounts4.json'),
JSON.stringify(entries, null, 2)
);
console.log(`Wrote top links with counts`);
process.exit(0);
}
async function make_v1() {
const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [clean(url), {url,title}]));
const countsFile = fs.readFileSync(path.resolve('.', 'counts.json')).toString();
const counts = new Map(JSON.parse(countsFile).map(([url, count]) => [clean(url), count]));
for ( const [key, count] of counts ) {
counts.set(clean2(key), count);
}
const entries = [];
for ( const [key, {url,title}] of titles ) {
entries.push({
url, title,
count: counts.get(key) ||
counts.get(url) ||
counts.get(clean2(key)) ||
console.log(`No count found for`, {key, url, title, c2key: clean2(key)})
});
}
fs.writeFileSync(
path.resolve('.', 'topFrontPageLinks.json'),
JSON.stringify(entries, null, 2)
);
}
function clean(urlString) {
const url = new URL(urlString);
if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
} else {
url.hash = '';
}
for ( const key of [...url.searchParams.keys()] ) {
if ( key.startsWith('utm_') ) {
url.searchParams.delete(key);
}
}
url.pathname = url.pathname.replace(/\/$/, '');
url.protocol = 'https:';
url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
if ( url.hostname.startsWith('www.') ) {
url.hostname = url.hostname.replace(/^www./, '');
}
const key = url.toString();
return key;
}
function clean2(urlString) {
const url = new URL(urlString);
url.pathname = '';
return url.toString();
}
function curlCommand(url) {
return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'DNT: 1' \
-H 'Pragma: no-cache' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: none' \
-H 'Sec-Fetch-User: ?1' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--compressed ;
`;
}

public/none (new file, 1440 lines): diff suppressed because it is too large.

public/problem_find.mjs (new executable file, 117 lines)
@@ -0,0 +1,117 @@
#!/usr/bin/env node
import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';
import {
loadPref,
cache_file,
index_file,
} from '../src/args.js';
const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const problems = new Map();
let cleaning = false;
let made = false;
process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);
console.log({Pref:loadPref(), cache_file: cache_file(), index_file: index_file()});
make();
async function make() {
const indexFile = fs.readFileSync(index_file()).toString();
JSON.parse(indexFile).map(([key, value]) => {
if ( typeof key === "number" ) return;
if ( key.startsWith('ndx') ) return;
if ( value.title === undefined ) {
console.log('no title property', {key, value});
return;
}
const url = key;
const title = value.title.toLocaleLowerCase();
if ( title.length === 0 || title.includes('404') || title.includes('not found') ) {
if ( problems.has(url) ) {
console.log('Found duplicate', url, title, problems.get(url));
}
problems.set(url, title);
}
});
made = true;
cleanup();
}
function cleanup() {
if ( cleaning ) return;
if ( ! made ) return;
cleaning = true;
console.log('cleanup running');
const outData = [...problems.entries()];
fs.writeFileSync(
path.resolve('.', 'url-problems.json'),
JSON.stringify(outData, null, 2)
);
const {size:bytesWritten} = fs.statSync(
path.resolve('.', 'url-problems.json'),
{bigint: true}
);
console.log(`Wrote ${outData.length} problem urls in ${bytesWritten} bytes.`);
process.exit(0);
}
function clean(urlString) {
const url = new URL(urlString);
if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
} else {
url.hash = '';
}
for ( const key of [...url.searchParams.keys()] ) {
if ( key.startsWith('utm_') ) {
url.searchParams.delete(key);
}
}
url.pathname = url.pathname.replace(/\/$/, '');
url.protocol = 'https:';
url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
if ( url.hostname.startsWith('www.') ) {
url.hostname = url.hostname.replace(/^www./, '');
}
const key = url.toString();
return key;
}
function clean2(urlString) {
const url = new URL(urlString);
url.pathname = '';
return url.toString();
}
function curlCommand(url) {
return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'DNT: 1' \
-H 'Pragma: no-cache' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: none' \
-H 'Sec-Fetch-User: ?1' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--compressed ;
`;
}

public/redirector.html (new file, 21 lines)
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<meta name="referrer" content="no-referrer" />
<h1>About to archive and index <code id=url-text></code></h1>
<script type=module>
const url = new URLSearchParams(location.search).get('url');
const text = document.querySelector('#url-text');
let valid = false;
try {
new URL(url);
valid = true;
} catch(e) {
console.warn(`URL ${url} is not a valid URL`);
}
if ( valid ) {
text.innerText = url;
setTimeout(() => {
window.location.href = url;
}, 1000);
}
</script>

@@ -47,7 +47,7 @@
}
button {
}
input.long {
form .long {
width: 100%;
min-width: 250px;
}
@@ -64,6 +64,24 @@
small.url {
word-break: break-all;
}
.small {
font-size: smaller;
}
label small {
font-style: italic;
color: darkslategrey;
}
.units {
color: grey;
font-size: smaller;
}
input[type="number"] {
text-align: right;
}
input.search {
flex-grow: 1;
padding: 0.25em 0.5em;
@@ -72,6 +90,12 @@
input.search + button {
font-size: 1em;
}
ol {
ol.results {
list-style-type: none;
}
.cent {
text-align: center;
}
.grey {
color: grey;
}

public/top.html (new file, 3 lines)
@@ -0,0 +1,3 @@
<script>
</script>

run.sh (new executable file, 6 lines)
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
phys=$(free -t -m | grep -oP '\d+' | sed '10!d') # combined mem+swap total in MB (10th number of `free -t -m`)
alloc=$(echo "$phys * 90/100" | bc) # let node's old-generation heap use 90% of it
echo "$alloc"
node --max-old-space-size="$alloc" src/app.js

@@ -1,4 +1,10 @@
#!/usr/bin/env bash
echo "Post install script"
npm i -g rollup eslint
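# Set up local TLS: mkcert creates a local CA and issues a localhost certificate
# into $HOME/local-sslcerts, where src/libraryServer.js looks for privkey.pem
# and fullchain.pem.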
which brew || /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
which mkcert || brew install mkcert
mkdir -p $HOME/local-sslcerts
cd $HOME/local-sslcerts
mkcert -key-file privkey.pem -cert-file fullchain.pem localhost
mkcert -install

@@ -3,34 +3,34 @@ import ChildProcess from 'child_process';
import ChromeLauncher from 'chrome-launcher';
import {DEBUG, sleep, NO_SANDBOX} from './common.js';
import {DEBUG, sleep, NO_SANDBOX, GO_SECURE} from './common.js';
import Archivist from './archivist.js';
import {Archivist} from './archivist.js';
import LibraryServer from './libraryServer.js';
import args from './args.js';
const {server_port, mode, chrome_port} = args;
const CHROME_OPTS = !NO_SANDBOX ? [
'--restore-last-session',
/*'--restore-last-session',*/
`--disk-cache-dir=${args.temp_browser_cache()}`,
`--aggressive-cache-discard`
] : [
'--restore-last-session',
/*'--restore-last-session',*/
`--disk-cache-dir=${args.temp_browser_cache()}`,
`--aggressive-cache-discard`,
'--no-sandbox'
'--no-sandbox',
];
const LAUNCH_OPTS = {
logLevel: DEBUG ? 'verbose' : 'silent',
port: chrome_port,
chromeFlags:CHROME_OPTS,
userDataDir:false,
startingUrl: `http://localhost:${args.server_port}`,
startingUrl: `${GO_SECURE ? 'https' : 'http'}://localhost:${args.server_port}`,
ignoreDefaultFlags: true
}
const KILL_ON = {
win32: 'taskkill /IM chrome.exe /F',
darwin: 'pkill -15 chrome',
darwin: 'kill $(pgrep Chrome)',
freebsd: 'pkill -15 chrome',
linux: 'pkill -15 chrome',
};
@@ -42,11 +42,16 @@ start();
async function start() {
console.log(`Running in node...`);
process.on('beforeExit', cleanup);
process.on('SIGBREAK', cleanup);
process.on('error', cleanup);
process.on('unhandledRejection', cleanup);
process.on('uncaughtException', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('beforeExit', cleanup);
process.on('SIGINT', code => cleanup(code, 'signal', {exit:true}));
process.on('SIGTERM', code => cleanup(code, 'signal', {exit:true}));
process.on('SIGQUIT', code => cleanup(code, 'signal', {exit:true}));
process.on('SIGBREAK', code => cleanup(code, 'signal', {exit:true}));
process.on('SIGABRT', code => cleanup(code, 'signal', {exit:true}));
console.log(`Importing dependencies...`);
const {launch:ChromeLaunch} = ChromeLauncher;
@@ -71,7 +76,7 @@ async function start() {
await ChromeLaunch(LAUNCH_OPTS);
} catch(e) {
console.log(`Could not launch chrome.`);
DEBUG && console.info('Chrome launch error:', e);
DEBUG.verboseSlow && console.info('Chrome launch error:', e);
process.exit(1);
}
console.log(`Chrome started.`);
@@ -92,7 +97,7 @@ async function killChrome(wait = true) {
));
if ( err ) {
console.log(`There was no running chrome.`);
//DEBUG && console.warn("Error closing existing chrome", err);
DEBUG.verboseSlow && console.warn("Error closing existing chrome", err);
} else {
console.log(`Running chrome shut down.`);
if ( wait ) {
@@ -108,8 +113,8 @@
}
}
async function cleanup(reason) {
console.log(`Cleanup called on reason: ${reason}`);
async function cleanup(reason, err, {exit = false} = {}) {
console.log(`Cleanup called on reason: ${reason}`, err);
if ( quitting ) {
console.log(`Cleanup already called so not running again.`);
@@ -123,9 +128,11 @@
killChrome(false);
console.log(`Take a breath. Everything's done. 22120 is exiting in 3 seconds...`);
if ( exit ) {
console.log(`Take a breath. Everything's done. DiskerNet is exiting in 3 seconds...`);
await sleep(3000);
await sleep(3000);
process.exit(0);
process.exit(0);
}
}

File diff suppressed because it is too large.

@@ -7,19 +7,21 @@ const mode = process.argv[3] || 'save';
const chrome_port = process.argv[4] || 9222;
const Pref = {};
const pref_file = path.resolve(os.homedir(), '.22120.config.json');
export const CONFIG_DIR = path.resolve(os.homedir(), '.config', 'dosyago', 'DiskerNet');
fs.mkdirSync(CONFIG_DIR, {recursive:true});
const pref_file = path.resolve(CONFIG_DIR, 'config.json');
const cacheId = Math.random();
loadPref();
let BasePath = Pref.BasePath;
const archive_root = () => path.resolve(BasePath, '22120-arc');
const no_file = () => path.resolve(archive_root(), 'no.json');
const temp_browser_cache = () => path.resolve(archive_root(), 'temp-browser-cache' + cacheId);
const library_path = () => path.resolve(archive_root(), 'public', 'library');
const cache_file = () => path.resolve(library_path(), 'cache.json');
const index_file = () => path.resolve(library_path(), 'index.json');
const fts_index_dir = () => path.resolve(library_path(), 'fts');
export const archive_root = () => path.resolve(BasePath, '22120-arc');
export const no_file = () => path.resolve(archive_root(), 'no.json');
export const temp_browser_cache = () => path.resolve(archive_root(), 'temp-browser-cache' + cacheId);
export const library_path = () => path.resolve(archive_root(), 'public', 'library');
export const cache_file = () => path.resolve(library_path(), 'cache.json');
export const index_file = () => path.resolve(library_path(), 'index.json');
export const fts_index_dir = () => path.resolve(library_path(), 'fts');
const flex_fts_index_dir = base => path.resolve(base || fts_index_dir(), 'flex');
const ndx_fts_index_dir = base => path.resolve(base || fts_index_dir(), 'ndx');
@@ -29,7 +31,7 @@ const results_per_page = 10;
console.log(`Args usage: <server_port> <save|serve> <chrome_port> <library_path>`);
updateBasePath(process.argv[5] || Pref.BasePath || os.homedir());
updateBasePath(process.argv[5] || Pref.BasePath || CONFIG_DIR);
const args = {
mode,
@@ -50,7 +52,8 @@ const args = {
ndx_fts_index_dir,
fuzzy_fts_index_dir,
results_per_page
results_per_page,
CONFIG_DIR
};
export default args;
@@ -126,7 +129,7 @@ function getBasePath() {
return BasePath;
}
function loadPref() {
export function loadPref() {
if ( fs.existsSync(pref_file) ) {
try {
Object.assign(Pref, JSON.parse(fs.readFileSync(pref_file)));
@@ -137,6 +140,7 @@ function loadPref() {
console.log("Preferences file does not exist. Creating one...");
savePref();
}
return clone(Pref);
}
function savePref() {
@@ -147,3 +151,7 @@ function savePref() {
}
}
function clone(o) {
return JSON.parse(JSON.stringify(o));
}

@@ -86,13 +86,13 @@ export async function* bookmarkChanges() {
filename = filename || '';
// listen to everything
const path = Path.resolve(dirPath, filename);
DEBUG && console.log(event, path);
DEBUG.verboseSlow && console.log(event, path);
if ( isBookmarkFile(filename) ) {
if ( ! State.active.has(path) ) {
State.active.add(path);
}
// but only act if it is a bookmark file
DEBUG && console.log(event, path, notifyChange);
DEBUG.verboseSlow && console.log(event, path, notifyChange);
// save the event type and file it happened to
change = {event, path};
// drop the most recently pushed promise from our bookkeeping list
@@ -208,7 +208,7 @@ function getProfileRootDir() {
let name = PLAT_TABLE[plat];
let rootDir;
DEBUG && console.log({plat, name});
DEBUG.verboseSlow && console.log({plat, name});
if ( !name ) {
if ( plat === 'win32' ) {

@@ -4,8 +4,9 @@ import fs from 'fs';
import os from 'os';
export const DEBUG = {
verboseSlow: false,
debug: process.env.DEBUG_22120 || false,
checkPred: true
checkPred: false
}
export const SHOW_FETCH = false;

@@ -43,7 +43,7 @@ export function highlight(query, doc, {
doc2.splice(doc2.length, 0, ...(new Array((chunkSize - doc2.length % chunkSize) % chunkSize)).join(' ').split(''));
const fragments2 = doc2.reduce(getFragmenter(chunkSize), []);
query.toLocaleLowerCase();
DEBUG && console.log(fragments);
DEBUG.verboseSlow && console.log(fragments);
const scores = [...fragments, ...fragments2].map(fragment => {
const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);
@@ -65,7 +65,7 @@ export function highlight(query, doc, {
let result;
if ( highlights.length === 0 ) {
DEBUG && console.log('Zero highlights, showing first score', scores[0]);
DEBUG.verboseSlow && console.log('Zero highlights, showing first score', scores[0]);
result = scores.slice(0,1);
} else {
let better = Array.from(highlights).slice(0, 10);
@@ -73,7 +73,7 @@ export function highlight(query, doc, {
const length = Array.from(hl.fragment.text).length;
let {offset, symbols} = hl.fragment;
const newText = symbols.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + symbols.slice(offset + length, offset + length + extra).join('');
DEBUG && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});
DEBUG.verboseSlow && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});
hl.fragment.text = newText;
const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);
@@ -83,7 +83,7 @@ export function highlight(query, doc, {
return hl;
});
better.sort(({score:a}, {score:b}) => a-b);
DEBUG && console.log(JSON.stringify({better},null,2));
DEBUG.verboseSlow && console.log(JSON.stringify({better},null,2));
result = better.slice(0,3);
}
@@ -107,16 +107,17 @@ export function trilight(query, doc, {
const trigrams = doc.reduce(getFragmenter(ngramSize, {overlap:true}), []);
const index = trigrams.reduce((idx, frag) => {
let counts = idx[frag.text];
let counts = idx.get(frag.text);
if ( ! counts ) {
counts = idx[frag.text] = [];
counts = [];
idx.set(frag.text, counts);
}
counts.push(frag.offset);
return idx;
}, {});
}, new Map);
const qtris = query.reduce(getFragmenter(ngramSize, {overlap:true}), []);
const entries = qtris.reduce((E, {text}, qi) => {
const counts = index[text];
const counts = index.get(text);
if ( counts ) {
counts.forEach(di => {
const entry = {text, qi, di};
@@ -204,9 +205,9 @@ export function trilight(query, doc, {
}
}
if ( assigned ) {
DEBUG && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
DEBUG.verboseSlow && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
} else {
DEBUG && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing
DEBUG.verboseSlow && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing
as it would have made an existing segment too long, or it was already too long itself.`
);
}
@@ -214,10 +215,10 @@ export function trilight(query, doc, {
segments.sort(({score:a}, {score:b}) => b-a);
const textSegments = segments.map(({start,end}) => oDoc.slice(start,end).join(''));
//console.log(JSON.stringify({gaps}, null, 2));
DEBUG && console.log(segments, textSegments);
DEBUG.verboseSlow && console.log(segments, textSegments);
if ( textSegments.length === 0 ) {
DEBUG && console.log({query, doc, maxLength, ngramSize, maxSegmentSize,
DEBUG.verboseSlow && console.log({query, doc, maxLength, ngramSize, maxSegmentSize,
trigrams,
index,
entries,

@@ -1,16 +1,26 @@
import http from 'http';
import https from 'https';
import fs from 'fs';
import os from 'os';
import path from 'path';
import express from 'express';
import args from './args.js';
import {
GO_SECURE,
MAX_REAL_URL_LENGTH,
MAX_HEAD, MAX_HIGHLIGHTABLE_LENGTH, DEBUG,
say, sleep, APP_ROOT
say, sleep, APP_ROOT,
RichError
} from './common.js';
import Archivist from './archivist.js';
import {trilight, /*highlight*/} from './highlighter.js';
import {startCrawl, Archivist} from './archivist.js';
import {trilight, highlight} from './highlighter.js';
const SITE_PATH = path.resolve(APP_ROOT, '..', 'public');
const SearchCache = new Map();
const app = express();
let running = false;
@@ -20,18 +30,39 @@ const LibraryServer = {
start, stop
}
const secure_options = {};
const protocol = GO_SECURE ? https : http;
export default LibraryServer;
async function start({server_port}) {
if ( running ) {
DEBUG && console.warn(`Attempting to start server when it is not closed. Exiting start()...`);
DEBUG.verboseSlow && console.warn(`Attempting to start server when it is not closed. Exiting start()...`);
return;
}
running = true;
try {
const sec = {
key: fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'privkey.pem')),
cert: fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'fullchain.pem')),
ca: fs.existsSync(path.resolve(os.homedir(), 'local-sslcerts', 'chain.pem')) ?
fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'chain.pem'))
:
undefined
};
console.log({sec});
Object.assign(secure_options, sec);
} catch(e) {
console.warn(`No certs found so the server will run without SSL.`);
}
try {
port = server_port;
addHandlers();
Server = app.listen(Number(port), err => {
const secure = secure_options.cert && secure_options.key;
const server = protocol.createServer.apply(protocol, GO_SECURE && secure ? [secure_options, app] : [app]);
Server = server.listen(Number(port), err => {
if ( err ) {
running = false;
throw err;
@@ -41,12 +72,13 @@ async function start({server_port}) {
});
} catch(e) {
running = false;
DEBUG && console.error(`Error starting server`, e);
DEBUG.verboseSlow && console.error(`Error starting server`, e);
process.exit(1);
}
}
function addHandlers() {
app.use(express.urlencoded({extended:true}));
app.use(express.urlencoded({extended:true, limit: '50mb'}));
app.use(express.static(SITE_PATH));
if ( args.library_path() ) {
@@ -55,8 +87,29 @@ function addHandlers() {
app.get('/search(.json)?', async (req, res) => {
await Archivist.isReady();
const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
const results = resultIds.map(docId => Archivist.getDetails(docId));
let {query:oquery} = req.query;
if ( ! oquery ) {
return res.end(SearchResultView({results:[], query:'', HL:new Map, page:1}));
}
oquery = oquery.trim();
if ( ! oquery ) {
return res.end(SearchResultView({results:[], query:'', HL:new Map, page:1}));
}
let {page} = req.query;
if ( ! page || ! Number.isInteger(parseInt(page)) ) {
page = 1;
} else {
page = parseInt(page);
}
let resultIds, query, HL;
if ( SearchCache.has(oquery) ) {
({query, resultIds, HL} = SearchCache.get(oquery));
} else {
({query, results:resultIds, HL} = await Archivist.search(oquery));
SearchCache.set(oquery, {query, resultIds, HL});
}
const start = (page-1)*args.results_per_page;
const results = resultIds.slice(start,start+args.results_per_page).map(docId => Archivist.getDetails(docId))
if ( req.path.endsWith('.json') ) {
res.end(JSON.stringify({
results, query
@@ -73,7 +126,7 @@ function addHandlers() {
.map(segment => Archivist.findOffsets(query, segment))
.join(' ... ');
});
res.end(SearchResultView({results, query, HL}));
res.end(SearchResultView({results, query, HL, page}));
}
});
@@ -133,6 +186,63 @@ function addHandlers() {
res.redirect('/');
}
});
app.post('/crawl', async (req, res) => {
try {
let {
links, timeout, depth, saveToFile,
maxPageCrawlTime, minPageCrawlTime, batchSize,
program,
} = req.body;
const oTimeout = timeout;
timeout = Math.round(parseFloat(timeout)*1000);
depth = Math.round(parseInt(depth));
batchSize = Math.round(parseInt(batchSize));
saveToFile = !!saveToFile;
minPageCrawlTime = Math.round(parseInt(minPageCrawlTime)*1000);
maxPageCrawlTime = Math.round(parseInt(maxPageCrawlTime)*1000);
if ( Number.isNaN(timeout) || Number.isNaN(depth) || typeof links != 'string' ) {
console.warn({invalid:{timeout,depth,links}});
throw new RichError({
status: 400,
message: 'Invalid parameters: timeout, depth or links'
});
}
const urls = links.split(/[\n\s\r]+/g).map(u => u.trim()).filter(u => {
const tooShort = u.length === 0;
if ( tooShort ) return false;
const tooLong = u.length > MAX_REAL_URL_LENGTH;
if ( tooLong ) return false;
let invalid = false;
try {
new URL(u);
} catch {
invalid = true;
};
if ( invalid ) return false;
return true;
}).map(url => ({url,depth:1}));
console.log(`Starting crawl from ${urls.length} URLs, waiting ${oTimeout} seconds for each to load, and continuing to a depth of ${depth} clicks...`);
await startCrawl({
urls, timeout, depth, saveToFile, batchSize, minPageCrawlTime, maxPageCrawlTime, program,
});
res.end(`Starting crawl from ${urls.length} URLs, waiting ${oTimeout} seconds for each to load, and continuing to a depth of ${depth} clicks...`);
} catch(e) {
if ( e instanceof RichError ) {
console.warn(e);
const {status, message} = JSON.parse(e.message);
res.status(status);
res.end(message);
} else {
console.warn(e);
res.sendStatus(500);
}
return;
}
});
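// Example POST (assumed usage; the field names match the crawl form added in
// this commit, and <server_port> is whatever port the library server runs on):
//   curl -X POST http://localhost:<server_port>/crawl \
//     --data-urlencode 'links=https://example.com' \
//     --data-urlencode 'timeout=3.6' --data-urlencode 'depth=1' \
//     --data-urlencode 'batchSize=2' \
//     --data-urlencode 'minPageCrawlTime=20' --data-urlencode 'maxPageCrawlTime=30'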
}
async function stop() {
@@ -251,7 +361,7 @@ function IndexView(urls, {edit:edit = false} = {}) {
`
}
function SearchResultView({results, query, HL}) {
function SearchResultView({results, query, HL, page}) {
return `
<!DOCTYPE html>
<meta charset=utf-8>
@@ -273,7 +383,7 @@ function SearchResultView({results, query, HL}) {
<p>
Showing results for <b>${query}</b>
</p>
<ol>
<ol class=results start="${(page-1)*args.results_per_page+1}">
${
results.map(({snippet, url,title,id}) => `
<li>
@@ -289,6 +399,19 @@ function SearchResultView({results, query, HL}) {
`).join('\n')
}
</ol>
<p class=cent>
${page > 1 ? `
<a href=/search?query=${encodeURIComponent(query)}&page=${encodeURIComponent(page-1)}>
&lt; Page ${page-1}
</a> |` : ''}
<span class=grey>
Page ${page}
</span>
|
<a href=/search?query=${encodeURIComponent(query)}&page=${encodeURIComponent(page+1)}>
Page ${page+1} &gt;
</a>
</p>
`
}

@@ -40,7 +40,7 @@ export async function connect({port:port = 9222} = {}) {
return {
send,
on, ons,
on, ons, ona,
close
};
@@ -59,17 +59,24 @@
const outGoing = JSON.stringify(message);
MESSAGES.set(key, outGoing);
socket.send(outGoing);
DEBUG && (SHOW_FETCH || !method.startsWith('Fetch')) && console.log("Sent", message);
DEBUG.verboseSlow && (SHOW_FETCH || !method.startsWith('Fetch')) && console.log("Sent", message);
return promise;
}
async function handle(message) {
if ( typeof message !== "string" ) {
try {
message += '';
} catch(e) {
message = message.toString();
}
}
const stringMessage = message;
message = JSON.parse(message);
if ( message.error ) {
const showError = DEBUG || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
const showError = DEBUG.protocol || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
if ( showError ) {
console.warn(message);
DEBUG.protocol && console.warn(message);
}
}
const {sessionId} = message;
@@ -80,7 +87,7 @@
const key = `${sessionId||ROOT_SESSION}:${id}`;
const resolve = Resolvers[key];
if ( ! resolve ) {
console.warn(`No resolver for key`, key, stringMessage.slice(0,140));
DEBUG.protocol && console.warn(`No resolver for key`, key, stringMessage.slice(0,140));
} else {
Resolvers[key] = undefined;
try {
@@ -94,7 +101,7 @@
const showError = DEBUG || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
if ( showError ) {
const originalMessage = MESSAGES.get(key);
console.warn({originalMessage});
DEBUG.protocol && console.warn({originalMessage});
}
}
}
@@ -131,6 +138,20 @@
listeners.push(handler);
}
function ona(method, handler, sessionId) {
let listeners = Handlers[method];
if ( ! listeners ) {
Handlers[method] = listeners = [];
}
listeners.push(({message}) => {
if ( message.sessionId === sessionId ) {
handler(message.params);
} else {
console.log(`No such`, {method, handler, sessionId, message});
}
});
}
function close() {
socket.close();
}