Delete some

This commit is contained in: commit bb560f0853
@@ -1 +0,0 @@
-Sun Jan 2 05:40:42 AM HKT 2022
@@ -1 +0,0 @@
-Wed Jan 5 02:35:00 PM HKT 2022
File diff suppressed because it is too large
@@ -17,6 +17,7 @@
    "clean": "rm -rf build/* bin/*",
    "super-clean": "npm run clean || : && rm -rf node_modules || : && rm package-lock.json",
    "test": "nodemon src/app.js",
    "inspect": "node --inspect-brk=127.0.0.1:9999 src/app.js",
    "save": "nodemon src/app.js DiskerNet save",
    "serve": "nodemon src/app.js DiskerNet serve",
    "lint": "watch -n 5 npx eslint .",
@@ -0,0 +1,133 @@
#!/usr/bin/env node

import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';

import {
  loadPref,
  cache_file,
  index_file,
} from '../src/args.js';

const CLEAN = true;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const problems = new Map();
let cleaning = false;
let made = false;

process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);

console.log({Pref:loadPref(), cache_file: cache_file(), index_file: index_file()});
make();

async function make() {
  const indexFile = fs.readFileSync(index_file()).toString();
  JSON.parse(indexFile).map(([key, value]) => {
    if ( typeof key === "number" ) return;
    if ( key.startsWith('ndx') ) return;
    if ( value.title === undefined ) {
      console.log('no title property', {key, value});
    }
    const url = key;
    const title = value.title.toLocaleLowerCase();
    if ( title.length === 0 || title.includes('404') || title.includes('not found') ) {
      if ( problems.has(url) ) {
        console.log('Found duplicate', url, title, problems.get(url));
      }
      const prob = {title, dupes:[], dupe:false};
      problems.set(url, prob);
      const cleaned1 = clean(url);
      if ( problems.has(cleaned1) ) {
        console.log(`Found duplicate`, {url, title, cleaned1, dupeEntry:problems.get(cleaned1)});
        prob.dupe = true;
        prob.dupes.push(cleaned1);
        url !== cleaned1 && (problems.delete(cleaned1), prob.diff = true);
      }
      const cleaned2 = clean2(url);
      if ( problems.has(cleaned2) ) {
        console.log(`Found duplicate`, {url, title, cleaned2, dupeEntry: problems.get(cleaned2)});
        prob.dupe = true;
        prob.dupes.push(cleaned2);
        url !== cleaned2 && (problems.delete(cleaned2), prob.diff = true);
      }
    }
  });

  made = true;

  cleanup();
}

function cleanup() {
  if ( cleaning ) return;
  if ( ! made ) return;
  cleaning = true;
  console.log('cleanup running');
  const outData = [...problems.entries()].filter(([key, {dupe}]) => dupe);
  outData.sort(([a], [b]) => a.localeCompare(b));
  fs.writeFileSync(
    path.resolve('.', 'url-cleaned-dupes.json'),
    JSON.stringify(outData, null, 2)
  );
  const {size:bytesWritten} = fs.statSync(
    path.resolve('.', 'url-cleaned-dupes.json'),
    {bigint: true}
  );
  console.log(`Wrote ${outData.length} dupe urls in ${bytesWritten} bytes.`);
  process.exit(0);
}

function clean(urlString) {
  const url = new URL(urlString);
  if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
  } else {
    url.hash = '';
  }
  for ( const [key, value] of url.searchParams ) {
    if ( key.startsWith('utm_') ) {
      url.searchParams.delete(key);
    }
  }
  url.pathname = url.pathname.replace(/\/$/, '');
  url.protocol = 'https:';
  url.pathname = url.pathname.replace(/(\.htm.?|\.php|\.asp.?)$/, '');
  if ( url.hostname.startsWith('www.') ) {
    url.hostname = url.hostname.replace(/^www./, '');
  }
  const key = url.toString();
  return key;
}

function clean2(urlString) {
  const url = new URL(urlString);
  url.pathname = '';
  return url.toString();
}

function curlCommand(url) {
  return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
  -H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
  -H 'Cache-Control: no-cache' \
  -H 'Connection: keep-alive' \
  -H 'DNT: 1' \
  -H 'Pragma: no-cache' \
  -H 'Sec-Fetch-Dest: document' \
  -H 'Sec-Fetch-Mode: navigate' \
  -H 'Sec-Fetch-Site: none' \
  -H 'Sec-Fetch-User: ?1' \
  -H 'Upgrade-Insecure-Requests: 1' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
  -H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: "macOS"' \
  --compressed ;
  `;
}
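For orientation, here is what the clean() normalization above does to one hypothetical URL. This is a sketch that assumes clean() is in scope (for example, pasted into a REPL alongside the function); it is not part of the commit, and it relies only on Node's standard WHATWG URL behavior.

import {strict as assert} from 'node:assert';

// Fragment dropped, utm_ parameter removed, '.html' suffix and 'www.' prefix
// stripped, protocol upgraded to https:
assert.equal(
  clean('http://www.example.com/post.html?utm_source=x&id=5#top'),
  'https://example.com/post?id=5'
);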
@@ -0,0 +1,92 @@
#!/usr/bin/env node

import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';

const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const entries = [];
let cleaning = false;

process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);

make();

async function make() {
  const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
  const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
  titles.forEach(({url,title}) => {
    if ( title.length === 0 && url.startsWith('https:') && !url.endsWith('.pdf') ) {
      entries.push(url);
    }
  });

  cleanup();
}

function cleanup() {
  if ( cleaning ) return;
  cleaning = true;
  console.log('cleanup running');
  fs.writeFileSync(
    path.resolve('.', 'recrawl-https-3.json'),
    JSON.stringify(entries, null, 2)
  );
  console.log(`Wrote recrawlable urls`);
  process.exit(0);
}

function clean(urlString) {
  const url = new URL(urlString);
  if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
  } else {
    url.hash = '';
  }
  for ( const [key, value] of url.searchParams ) {
    if ( key.startsWith('utm_') ) {
      url.searchParams.delete(key);
    }
  }
  url.pathname = url.pathname.replace(/\/$/, '');
  url.protocol = 'https:';
  url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
  if ( url.hostname.startsWith('www.') ) {
    url.hostname = url.hostname.replace(/^www./, '');
  }
  const key = url.toString();
  return key;
}

function clean2(urlString) {
  const url = new URL(urlString);
  url.pathname = '';
  return url.toString();
}

function curlCommand(url) {
  return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
  -H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
  -H 'Cache-Control: no-cache' \
  -H 'Connection: keep-alive' \
  -H 'DNT: 1' \
  -H 'Pragma: no-cache' \
  -H 'Sec-Fetch-Dest: document' \
  -H 'Sec-Fetch-Mode: navigate' \
  -H 'Sec-Fetch-Site: none' \
  -H 'Sec-Fetch-User: ?1' \
  -H 'Upgrade-Insecure-Requests: 1' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
  -H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: "macOS"' \
  --compressed ;
  `;
}
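A quick illustration of which URLs the filter in make() above selects, using hypothetical sample data shaped like topTitles.json's [url, title] pairs (the sample domains are mine, not from the commit):

const sample = new Map([
  ['https://a.example', {url: 'https://a.example', title: ''}],                  // kept: empty title, https, not a PDF
  ['https://b.example/doc.pdf', {url: 'https://b.example/doc.pdf', title: ''}],  // skipped: PDF
  ['http://c.example', {url: 'http://c.example', title: ''}],                    // skipped: not https
  ['https://d.example', {url: 'https://d.example', title: 'Front page'}],        // skipped: already has a title
]);
const kept = [];
sample.forEach(({url, title}) => {
  if ( title.length === 0 && url.startsWith('https:') && !url.endsWith('.pdf') ) {
    kept.push(url);
  }
});
console.log(kept); // ['https://a.example']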
@@ -8,6 +8,140 @@
<p>
  View <a href=/archive_index.html>the index</a>
</p>
<form method=POST action=/crawl>
  <fieldset>
    <legend>Crawl and Index</legend>
    <p>
      Crawl and index a list of links.
      <br>
      <small>This will open 1 link at a time, and index it when it has loaded.</small>
    <p>
      <label>
        Links
        <br>
        <textarea class=long name=links>
https://cnn.com
https://bloomberg.com
https://microsoft.com
https://dosyago.com
https://intel.com
        </textarea>
        <br>
        <small>List format is 1 link per line.</small>
      </label>
    </p>
    <details open>
      <summary>Advanced settings</summary>
      <p>
        <label>
          Timeout
          <br>
          <input required name=timeout
            type=number min=1 max=300 value=3.6 step=0.1> <span class=units>seconds</span>
          <br>
          <small>Seconds to wait for each page to load before indexing.</small>
        </label>
      <p>
        <label>
          Depth
          <br>
          <input required name=depth
            type=number min=1 max=20 value=1 step=1> <span class=units>clicks</span>
        </label>
        <br>
        <section class=small>
          <strong>Value guide</strong>
          <ol>
            <li>Only each link.
            <li>Plus anything 1 click from the link.
            <li>Plus anything 2 clicks from the link.
          </ol>
          <em>And so on…</em>
        </section>
      <p>
        <label>
          Min Page Crawl Time
          <br>
          <input name=minPageCrawlTime
            type=number min=1 max=60 value=20> <span class=units>seconds</span>
          <br>
          <small>Minimum seconds to spend crawling each page.</small>
        </label>
      <p>
      <p>
        <label>
          Max Page Crawl Time
          <br>
          <input name=maxPageCrawlTime
            type=number min=3 max=120 value=30> <span class=units>seconds</span>
          <br>
          <small>Max time to allow for each page.</small>
        </label>
      <p>
      <p>
        <label>
          Batch size
          <br>
          <input name=batchSize
            type=number min=1 max=32 value=2> <span class=units>tabs</span>
          <br>
          <small>Number of concurrent tabs.</small>
        </label>
      <p>
      <p>
        <label>
          <input name=saveToFile
            type=checkbox checked>
          Save the harvested URLs to a file
        </label>
      <p>
      <p>
        <label>
          <span class=text>Program to run on every page</span>
          <br>
          <textarea class=long rows=9 name=program>
if ( ! State.titles ) {
  State.titles = new Map();
  State.onExit.addHandler(() => {
    fs.writeFileSync(
      path.resolve('.', `titles-${(new Date).toISOString()}.txt`),
      JSON.stringify([...State.titles.entries()], null, 2) + '\n'
    );
  });
}
const {result:{value:data}} = await send("Runtime.evaluate",
  {
    expression: `(function () {
      return {
        url: document.location.href,
        title: document.title,
      };
    }())`,
    returnByValue: true
  },
  sessionId
);
State.titles.set(data.url, data.title);
console.log(`Saved ${State.titles.size} titles`);
          </textarea>
        </label>
      </p>
    </details>
    <p>
      <button>Crawl</button>
      <script>
      {
        const button = document.currentScript.previousElementSibling;
        let disabled = false;
        button.addEventListener('click', click => {
          if ( disabled ) return click.preventDefault();
          disabled = true;
          setTimeout(() => button.disabled = true, 0);
        });
      }
      </script>
  </fieldset>
</form>
<form method=GET action=/search>
  <fieldset class=search>
    <legend>Search your archive</legend>
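The crawl form above posts url-encoded fields to /crawl (the handler appears later in this commit, in the libraryServer.js hunks). A hypothetical scripted submission with the same field names could look like this; the field names and endpoint come from the form, everything else is assumption:

await fetch('/crawl', {
  method: 'POST',
  headers: {'Content-Type': 'application/x-www-form-urlencoded'},
  body: new URLSearchParams({
    links: 'https://example.com\nhttps://example.org', // one link per line
    timeout: '3.6',          // seconds to wait for each page to load
    depth: '1',              // clicks away from each seed link
    minPageCrawlTime: '20',
    maxPageCrawlTime: '30',
    batchSize: '2',          // concurrent tabs
    saveToFile: 'on',
    program: '',             // optional per-page program
  }),
});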
@@ -47,6 +181,7 @@
  async function showCurrentMode() {
    const mode = await fetch('/mode').then(r => r.text());
    console.log({mode});
    form.notification.value = "";
    form.querySelector(`[name="mode"][value="${mode}"]`).checked = true;
  }
@@ -12,7 +12,8 @@ export function getInjection({sessionId}) {
  // in future
  return `
    {
      const DEBUG = ${DEBUG};
      const X = 1;
      const DEBUG = ${JSON.stringify(DEBUG, null, 2)};
      const MIN_CHECK_TEXT = 3000; // min time between checking documentElement.innerText
      const MIN_NOTIFY = 5000; // min time between telling controller text maybe changed
      const MAX_NOTIFICATIONS = 13; // max times we will tell controller text maybe changed
@@ -99,7 +100,7 @@ export function getInjection({sessionId}) {
          count++;
          handler({textChange:{source}});
        } catch(e) {
          DEBUG && console.warn('could not parse message', data, e);
          DEBUG.verboseSlow && console.warn('could not parse message', data, e);
        }
      }
    }
@@ -126,7 +127,7 @@ export function getInjection({sessionId}) {
      console.log('check');
      const textMutated = document.documentElement.innerText !== lastInnerText;
      if ( textMutated ) {
        DEBUG && console.log('Text changed');
        DEBUG.verboseSlow && console.log('Text changed');
        lastInnerText = document.documentElement.innerText;
        Top.postMessage({frameTextChangeNotification:{source:location.href}}, '*');
      }
@@ -0,0 +1,250 @@
#!/usr/bin/env node

import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';

const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const entries = [];
const counted = new Set();
const errors = new Map();
let counts;
let cleaning = false;

process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);

make();

async function make() {
  const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
  const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
  if ( CLEAN ) {
    for ( const [url, obj] of titles ) {
      const k1 = clean(url);
      const k2 = clean2(url);
      if ( !titles.has(k1) ) {
        titles.set(k1, obj);
      }
      if ( !titles.has(k2) ) {
        titles.set(k2, obj);
      }
    }
  }
  const remainingFile = fs.readFileSync(path.resolve('.', 'remainingFile.json')).toString();
  const remainingSet = new Set(JSON.parse(remainingFile));
  const countsFile = fs.readFileSync(path.resolve('.', 'ran-counts.json')).toString();
  counts = new Map(JSON.parse(countsFile).filter(([url, count]) => remainingSet.has(url)));
  let current = 0;
  for ( const [url, count] of counts ) {
    let title;
    let realUrl;
    if ( titles.has(url) ) {
      ({title} = titles.get(url));
      entries.push({
        url,
        title,
        count,
      });
      counted.add(url);
    } else {
      console.log(`Curl call for ${url} in progress...`);
      let notifyCurlComplete;
      const curlCall = new Promise(res => notifyCurlComplete = res);
      do {
        await sleep(1000);
      } while ( current >= CONCURRENT );
      child_process.exec(curlCommand(url), (err, stdout, stderr) => {
        if ( ! err && (!stderr || stderr.length == 0)) {
          realUrl = stdout;
          if ( titles.has(realUrl) ) {
            ({title} = titles.get(realUrl));
            entries.push({
              url,
              realUrl,
              title,
              count,
            });
            counted.add(url);
          }
        } else {
          console.log(`Error on curl for ${url}`, {err, stderr});
          errors.set(url, {err, stderr});
        }
        console.log(`Curl call for ${url} complete!`);
        notifyCurlComplete();
      });
      current += 1;
      curlCall.then(() => current -= 1);
    }
  }
  cleanup();
}

async function make_v2() {
  const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
  const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [url, {url,title}]));
  if ( CLEAN ) {
    for ( const [url, obj] of titles ) {
      const k1 = clean(url);
      const k2 = clean2(url);
      if ( !titles.has(k1) ) {
        titles.set(k1, obj);
      }
      if ( !titles.has(k2) ) {
        titles.set(k2, obj);
      }
    }
  }
  const countsFile = fs.readFileSync(path.resolve('.', 'ran-counts.json')).toString();
  counts = new Map(JSON.parse(countsFile));
  let current = 0;
  for ( const [url, count] of counts ) {
    let title;
    let realUrl;
    if ( titles.has(url) ) {
      ({title} = titles.get(url));
      entries.push({
        url,
        title,
        count,
      });
      counted.add(url);
    } else {
      console.log(`Curl call for ${url} in progress...`);
      let notifyCurlComplete;
      const curlCall = new Promise(res => notifyCurlComplete = res);
      do {
        await sleep(250);
      } while ( current >= CONCURRENT );
      child_process.exec(curlCommand(url), (err, stdout, stderr) => {
        if ( ! err && (!stderr || stderr.length == 0)) {
          realUrl = stdout;
          if ( titles.has(realUrl) ) {
            ({title} = titles.get(realUrl));
            entries.push({
              url,
              realUrl,
              title,
              count,
            });
            counted.add(url);
          }
        } else {
          console.log(`Error on curl for ${url}`, {err, stderr});
          errors.set(url, {err, stderr});
        }
        console.log(`Curl call for ${url} complete!`);
        notifyCurlComplete();
      });
      current += 1;
      curlCall.then(() => current -= 1);
    }
  }
  cleanup();
}

function cleanup() {
  if ( cleaning ) return;
  cleaning = true;
  console.log('cleanup running');
  if ( errors.size ) {
    fs.writeFileSync(
      path.resolve('.', 'errorLinks4.json'),
      JSON.stringify([...errors.keys()], null, 2)
    );
    console.log(`Wrote errors`);
  }
  if ( counted.size !== counts.size ) {
    counted.forEach(url => counts.delete(url));
    fs.writeFileSync(
      path.resolve('.', 'noTitleFound4.json'),
      JSON.stringify([...counts.keys()], null, 2)
    )
    console.log(`Wrote noTitleFound`);
  }
  fs.writeFileSync(
    path.resolve('.', 'topFrontPageLinksWithCounts4.json'),
    JSON.stringify(entries, null, 2)
  );
  console.log(`Wrote top links with counts`);
  process.exit(0);
}

async function make_v1() {
  const titlesFile = fs.readFileSync(path.resolve('.', 'topTitles.json')).toString();
  const titles = new Map(JSON.parse(titlesFile).map(([url, title]) => [clean(url), {url,title}]));
  const countsFile = fs.readFileSync(path.resolve('.', 'counts.json')).toString();
  const counts = new Map(JSON.parse(countsFile).map(([url, count]) => [clean(url), count]));
  for ( const [key, count] of counts ) {
    counts.set(clean2(key), count);
  }
  const entries = [];
  for ( const [key, {url,title}] of titles ) {
    entries.push({
      url, title,
      count: counts.get(key) ||
        counts.get(url) ||
        counts.get(clean2(key)) ||
        console.log(`No count found for`, {key, url, title, c2key: clean2(key)})
    });
  }
  fs.writeFileSync(
    path.resolve('.', 'topFrontPageLinks.json'),
    JSON.stringify(entries, null, 2)
  );
}

function clean(urlString) {
  const url = new URL(urlString);
  if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
  } else {
    url.hash = '';
  }
  for ( const [key, value] of url.searchParams ) {
    if ( key.startsWith('utm_') ) {
      url.searchParams.delete(key);
    }
  }
  url.pathname = url.pathname.replace(/\/$/, '');
  url.protocol = 'https:';
  url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
  if ( url.hostname.startsWith('www.') ) {
    url.hostname = url.hostname.replace(/^www./, '');
  }
  const key = url.toString();
  return key;
}

function clean2(urlString) {
  const url = new URL(urlString);
  url.pathname = '';
  return url.toString();
}

function curlCommand(url) {
  return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
  -H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
  -H 'Cache-Control: no-cache' \
  -H 'Connection: keep-alive' \
  -H 'DNT: 1' \
  -H 'Pragma: no-cache' \
  -H 'Sec-Fetch-Dest: document' \
  -H 'Sec-Fetch-Mode: navigate' \
  -H 'Sec-Fetch-Site: none' \
  -H 'Sec-Fetch-User: ?1' \
  -H 'Upgrade-Insecure-Requests: 1' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
  -H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: "macOS"' \
  --compressed ;
  `;
}
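The script above throttles its curl subprocesses with a sleep-poll gate rather than a worker pool: it spins on sleep() until the in-flight count drops below CONCURRENT, then launches the next child and decrements the counter when that child's promise settles. A minimal standalone sketch of the same pattern (the names here are mine, not from the commit):

const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
let current = 0;

async function runThrottled(jobs) {
  for ( const job of jobs ) {       // job: () => Promise
    // Wait for a free slot before starting the next job.
    while ( current >= CONCURRENT ) {
      await sleep(250);
    }
    current += 1;
    job().finally(() => current -= 1);
  }
}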
File diff suppressed because it is too large
@@ -0,0 +1,117 @@
#!/usr/bin/env node

import fs from 'node:fs';
import path from 'node:path';
import child_process from 'node:child_process';

import {
  loadPref,
  cache_file,
  index_file,
} from '../src/args.js';

const CLEAN = false;
const CONCURRENT = 7;
const sleep = ms => new Promise(res => setTimeout(res, ms));
const problems = new Map();
let cleaning = false;
let made = false;

process.on('exit', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
process.on('SIGHUP', cleanup);
process.on('SIGUSR2', cleanup);
process.on('beforeExit', cleanup);

console.log({Pref:loadPref(), cache_file: cache_file(), index_file: index_file()});
make();

async function make() {
  const indexFile = fs.readFileSync(index_file()).toString();
  JSON.parse(indexFile).map(([key, value]) => {
    if ( typeof key === "number" ) return;
    if ( key.startsWith('ndx') ) return;
    if ( value.title === undefined ) {
      console.log('no title property', {key, value});
    }
    const url = key;
    const title = value.title.toLocaleLowerCase();
    if ( title.length === 0 || title.includes('404') || title.includes('not found') ) {
      if ( problems.has(url) ) {
        console.log('Found duplicate', url, title, problems.get(url));
      }
      problems.set(url, title);
    }
  });

  made = true;

  cleanup();
}

function cleanup() {
  if ( cleaning ) return;
  if ( ! made ) return;
  cleaning = true;
  console.log('cleanup running');
  const outData = [...problems.entries()];
  fs.writeFileSync(
    path.resolve('.', 'url-problems.json'),
    JSON.stringify(outData, null, 2)
  );
  const {size:bytesWritten} = fs.statSync(
    path.resolve('.', 'url-problems.json'),
    {bigint: true}
  );
  console.log(`Wrote ${outData.length} problem urls in ${bytesWritten} bytes.`);
  process.exit(0);
}

function clean(urlString) {
  const url = new URL(urlString);
  if ( url.hash.startsWith('#!') || url.hostname.includes('google.com') || url.hostname.includes('80s.nyc') ) {
  } else {
    url.hash = '';
  }
  for ( const [key, value] of url.searchParams ) {
    if ( key.startsWith('utm_') ) {
      url.searchParams.delete(key);
    }
  }
  url.pathname = url.pathname.replace(/\/$/, '');
  url.protocol = 'https:';
  url.pathname = url.pathname.replace(/(\.htm.?|\.php)$/, '');
  if ( url.hostname.startsWith('www.') ) {
    url.hostname = url.hostname.replace(/^www./, '');
  }
  const key = url.toString();
  return key;
}

function clean2(urlString) {
  const url = new URL(urlString);
  url.pathname = '';
  return url.toString();
}

function curlCommand(url) {
  return `curl -k -L -s -o /dev/null -w '%{url_effective}' ${JSON.stringify(url)} \
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
  -H 'Accept-Language: en,en-US;q=0.9,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.6,ja;q=0.5' \
  -H 'Cache-Control: no-cache' \
  -H 'Connection: keep-alive' \
  -H 'DNT: 1' \
  -H 'Pragma: no-cache' \
  -H 'Sec-Fetch-Dest: document' \
  -H 'Sec-Fetch-Mode: navigate' \
  -H 'Sec-Fetch-Site: none' \
  -H 'Sec-Fetch-User: ?1' \
  -H 'Upgrade-Insecure-Requests: 1' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' \
  -H 'sec-ch-ua: "Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: "macOS"' \
  --compressed ;
  `;
}
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<meta name="referrer" content="no-referrer" />
<h1>About to archive and index <code id=url-text></code></h1>
<script type=module>
  const url = new URLSearchParams(location.search).get('url');
  const text = document.querySelector('#url-text');
  let valid = false;
  try {
    new URL(url);
    valid = true;
  } catch(e) {
    console.warn(`URL ${url} is not a valid URL`);
  }

  if ( valid ) {
    text.innerText = url;
    setTimeout(() => {
      window.location.href = url;
    }, 1000);
  }
</script>
@@ -47,7 +47,7 @@
}
button {
}
input.long {
form .long {
  width: 100%;
  min-width: 250px;
}
@@ -64,6 +64,24 @@
small.url {
  word-break: break-all;
}
.small {
  font-size: smaller;
}

label small {
  font-style: italic;
  color: darkslategrey;
}

.units {
  color: grey;
  font-size: smaller;
}

input[type="number"] {
  text-align: right;
}

input.search {
  flex-grow: 1;
  padding: 0.25em 0.5em;
@@ -72,6 +90,12 @@
input.search + button {
  font-size: 1em;
}
ol {
ol.results {
  list-style-type: none;
}
.cent {
  text-align: center;
}
.grey {
  color: grey;
}
@@ -0,0 +1,3 @@
<script>

</script>
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

phys=$(free -t -m | grep -oP '\d+' | sed '10!d')
alloc=$(echo "$phys * 90/100" | bc )
echo $alloc
node --max-old-space-size=$alloc src/app.js
@@ -1,4 +1,10 @@
#!/usr/bin/env bash

echo "Post install script"
npm i -g rollup eslint
which brew || /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
which mkcert || brew install mkcert
mkdir -p $HOME/local-sslcerts
cd $HOME/local-sslcerts

mkcert -key-file privkey.pem -cert-file fullchain.pem localhost
mkcert -install
src/app.js | 43
@@ -3,34 +3,34 @@ import ChildProcess from 'child_process';

import ChromeLauncher from 'chrome-launcher';

import {DEBUG, sleep, NO_SANDBOX} from './common.js';
import {DEBUG, sleep, NO_SANDBOX, GO_SECURE} from './common.js';

import Archivist from './archivist.js';
import {Archivist} from './archivist.js';
import LibraryServer from './libraryServer.js';
import args from './args.js';

const {server_port, mode, chrome_port} = args;
const CHROME_OPTS = !NO_SANDBOX ? [
  '--restore-last-session',
  /*'--restore-last-session',*/
  `--disk-cache-dir=${args.temp_browser_cache()}`,
  `--aggressive-cache-discard`
] : [
  '--restore-last-session',
  /*'--restore-last-session',*/
  `--disk-cache-dir=${args.temp_browser_cache()}`,
  `--aggressive-cache-discard`,
  '--no-sandbox'
  '--no-sandbox',
];
const LAUNCH_OPTS = {
  logLevel: DEBUG ? 'verbose' : 'silent',
  port: chrome_port,
  chromeFlags:CHROME_OPTS,
  userDataDir:false,
  startingUrl: `http://localhost:${args.server_port}`,
  startingUrl: `${GO_SECURE ? 'https' : 'http'}://localhost:${args.server_port}`,
  ignoreDefaultFlags: true
}
const KILL_ON = {
  win32: 'taskkill /IM chrome.exe /F',
  darwin: 'pkill -15 chrome',
  darwin: 'kill $(pgrep Chrome)',
  freebsd: 'pkill -15 chrome',
  linux: 'pkill -15 chrome',
};
@@ -42,11 +42,16 @@ start();
async function start() {
  console.log(`Running in node...`);

  process.on('beforeExit', cleanup);
  process.on('SIGBREAK', cleanup);
  process.on('error', cleanup);
  process.on('unhandledRejection', cleanup);
  process.on('uncaughtException', cleanup);
  process.on('SIGHUP', cleanup);
  process.on('SIGINT', cleanup);
  process.on('SIGTERM', cleanup);
  process.on('beforeExit', cleanup);
  process.on('SIGINT', code => cleanup(code, 'signal', {exit:true}));
  process.on('SIGTERM', code => cleanup(code, 'signal', {exit:true}));
  process.on('SIGQUIT', code => cleanup(code, 'signal', {exit:true}));
  process.on('SIGBREAK', code => cleanup(code, 'signal', {exit:true}));
  process.on('SIGABRT', code => cleanup(code, 'signal', {exit:true}));

  console.log(`Importing dependencies...`);
  const {launch:ChromeLaunch} = ChromeLauncher;
@@ -71,7 +76,7 @@ async function start() {
    await ChromeLaunch(LAUNCH_OPTS);
  } catch(e) {
    console.log(`Could not launch chrome.`);
    DEBUG && console.info('Chrome launch error:', e);
    DEBUG.verboseSlow && console.info('Chrome launch error:', e);
    process.exit(1);
  }
  console.log(`Chrome started.`);
@@ -92,7 +97,7 @@ async function killChrome(wait = true) {
  ));
  if ( err ) {
    console.log(`There was no running chrome.`);
    //DEBUG && console.warn("Error closing existing chrome", err);
    DEBUG.verboseSlow && console.warn("Error closing existing chrome", err);
  } else {
    console.log(`Running chrome shut down.`);
    if ( wait ) {
@@ -108,8 +113,8 @@ async function killChrome(wait = true) {
  }
}

async function cleanup(reason) {
  console.log(`Cleanup called on reason: ${reason}`);
async function cleanup(reason, err, {exit = false} = {}) {
  console.log(`Cleanup called on reason: ${reason}`, err);

  if ( quitting ) {
    console.log(`Cleanup already called so not running again.`);

@@ -123,9 +128,11 @@ async function cleanup(reason) {

  killChrome(false);

  console.log(`Take a breath. Everything's done. 22120 is exiting in 3 seconds...`);
  if ( exit ) {
    console.log(`Take a breath. Everything's done. DiskerNet is exiting in 3 seconds...`);

    await sleep(3000);

    process.exit(0);
  }
}
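One reading of the cleanup() change above (my interpretation of the diff, not a statement of the author's design): hooks that fire on their own, like beforeExit, now merely observe, while signal handlers pass {exit:true} and so take the delayed process.exit(0) path. Sketched:

// Passive hook: observe shutdown, let Node exit on its own.
process.on('beforeExit', cleanup);
// Signal handler: observe, then take the `if ( exit )` branch in cleanup(),
// which sleeps 3 seconds and calls process.exit(0).
process.on('SIGINT', code => cleanup(code, 'signal', {exit: true}));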
src/archivist.js | 727
File diff suppressed because it is too large
src/args.js | 30
@@ -7,19 +7,21 @@ const mode = process.argv[3] || 'save';
const chrome_port = process.argv[4] || 9222;

const Pref = {};
const pref_file = path.resolve(os.homedir(), '.22120.config.json');
export const CONFIG_DIR = path.resolve(os.homedir(), '.config', 'dosyago', 'DiskerNet');
fs.mkdirSync(CONFIG_DIR, {recursive:true});
const pref_file = path.resolve(CONFIG_DIR, 'config.json');
const cacheId = Math.random();

loadPref();

let BasePath = Pref.BasePath;
const archive_root = () => path.resolve(BasePath, '22120-arc');
const no_file = () => path.resolve(archive_root(), 'no.json');
const temp_browser_cache = () => path.resolve(archive_root(), 'temp-browser-cache' + cacheId);
const library_path = () => path.resolve(archive_root(), 'public', 'library');
const cache_file = () => path.resolve(library_path(), 'cache.json');
const index_file = () => path.resolve(library_path(), 'index.json');
const fts_index_dir = () => path.resolve(library_path(), 'fts');
export const archive_root = () => path.resolve(BasePath, '22120-arc');
export const no_file = () => path.resolve(archive_root(), 'no.json');
export const temp_browser_cache = () => path.resolve(archive_root(), 'temp-browser-cache' + cacheId);
export const library_path = () => path.resolve(archive_root(), 'public', 'library');
export const cache_file = () => path.resolve(library_path(), 'cache.json');
export const index_file = () => path.resolve(library_path(), 'index.json');
export const fts_index_dir = () => path.resolve(library_path(), 'fts');

const flex_fts_index_dir = base => path.resolve(base || fts_index_dir(), 'flex');
const ndx_fts_index_dir = base => path.resolve(base || fts_index_dir(), 'ndx');

@@ -29,7 +31,7 @@ const results_per_page = 10;

console.log(`Args usage: <server_port> <save|serve> <chrome_port> <library_path>`);

updateBasePath(process.argv[5] || Pref.BasePath || os.homedir());
updateBasePath(process.argv[5] || Pref.BasePath || CONFIG_DIR);

const args = {
  mode,

@@ -50,7 +52,8 @@ const args = {
  ndx_fts_index_dir,
  fuzzy_fts_index_dir,

  results_per_page
  results_per_page,
  CONFIG_DIR
};

export default args;

@@ -126,7 +129,7 @@ function getBasePath() {
  return BasePath;
}

function loadPref() {
export function loadPref() {
  if ( fs.existsSync(pref_file) ) {
    try {
      Object.assign(Pref, JSON.parse(fs.readFileSync(pref_file)));

@@ -137,6 +140,7 @@ function loadPref() {
    console.log("Preferences file does not exist. Creating one...");
    savePref();
  }
  return clone(Pref);
}

function savePref() {

@@ -147,3 +151,7 @@ function savePref() {
  }
}

function clone(o) {
  return JSON.parse(JSON.stringify(o));
}
@@ -86,13 +86,13 @@ export async function* bookmarkChanges() {
      filename = filename || '';
      // listen to everything
      const path = Path.resolve(dirPath, filename);
      DEBUG && console.log(event, path);
      DEBUG.verboseSlow && console.log(event, path);
      if ( isBookmarkFile(filename) ) {
        if ( ! State.active.has(path) ) {
          State.active.add(path);
        }
        // but only act if it is a bookmark file
        DEBUG && console.log(event, path, notifyChange);
        DEBUG.verboseSlow && console.log(event, path, notifyChange);
        // save the event type and file it happened to
        change = {event, path};
        // drop the most recently pushed promise from our bookkeeping list

@@ -208,7 +208,7 @@ function getProfileRootDir() {
  let name = PLAT_TABLE[plat];
  let rootDir;

  DEBUG && console.log({plat, name});
  DEBUG.verboseSlow && console.log({plat, name});

  if ( !name ) {
    if ( plat === 'win32' ) {
@@ -4,8 +4,9 @@ import fs from 'fs';
import os from 'os';

export const DEBUG = {
  verboseSlow: false,
  debug: process.env.DEBUG_22120 || false,
  checkPred: true
  checkPred: false
}
export const SHOW_FETCH = false;
@@ -43,7 +43,7 @@ export function highlight(query, doc, {
  doc2.splice(doc2.length, 0, ...(new Array((chunkSize - doc2.length % chunkSize) % chunkSize)).join(' ').split(''));
  const fragments2 = doc2.reduce(getFragmenter(chunkSize), []);
  query.toLocaleLowerCase();
  DEBUG && console.log(fragments);
  DEBUG.verboseSlow && console.log(fragments);

  const scores = [...fragments, ...fragments2].map(fragment => {
    const distance = ukkonen(query, fragment.text.toLocaleLowerCase(), MaxDist);

@@ -65,7 +65,7 @@ export function highlight(query, doc, {
  let result;

  if ( highlights.length === 0 ) {
    DEBUG && console.log('Zero highlights, showing first score', scores[0]);
    DEBUG.verboseSlow && console.log('Zero highlights, showing first score', scores[0]);
    result = scores.slice(0,1);
  } else {
    let better = Array.from(highlights).slice(0, 10);

@@ -73,7 +73,7 @@ export function highlight(query, doc, {
      const length = Array.from(hl.fragment.text).length;
      let {offset, symbols} = hl.fragment;
      const newText = symbols.slice(Math.max(0,offset - extra), offset).join('') + hl.fragment.text + symbols.slice(offset + length, offset + length + extra).join('');
      DEBUG && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});
      DEBUG.verboseSlow && console.log({newText, oldText:hl.fragment.text, p:[Math.max(0,offset-extra), offset, offset+length, offset+length+extra], trueText: symbols.slice(offset, offset+length).join('')});
      hl.fragment.text = newText;
      const {MaxDist,MinScore,MaxScore} = params(Array.from(newText).length);
      const distance = ukkonen(query, hl.fragment.text.toLocaleLowerCase(), MaxDist);

@@ -83,7 +83,7 @@ export function highlight(query, doc, {
      return hl;
    });
    better.sort(({score:a}, {score:b}) => a-b);
    DEBUG && console.log(JSON.stringify({better},null,2));
    DEBUG.verboseSlow && console.log(JSON.stringify({better},null,2));
    result = better.slice(0,3);
  }

@@ -107,16 +107,17 @@ export function trilight(query, doc, {
  const trigrams = doc.reduce(getFragmenter(ngramSize, {overlap:true}), []);
  const index = trigrams.reduce((idx, frag) => {
    let counts = idx[frag.text];
    let counts = idx.get(frag.text);
    if ( ! counts ) {
      counts = idx[frag.text] = [];
      counts = [];
      idx.set(frag.text, counts);
    }
    counts.push(frag.offset);
    return idx;
  }, {});
  }, new Map);
  const qtris = query.reduce(getFragmenter(ngramSize, {overlap:true}), []);
  const entries = qtris.reduce((E, {text}, qi) => {
    const counts = index[text];
    const counts = index.get(text);
    if ( counts ) {
      counts.forEach(di => {
        const entry = {text, qi, di};

@@ -204,9 +205,9 @@ export function trilight(query, doc, {
      }
    }
    if ( assigned ) {
      DEBUG && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
      DEBUG.verboseSlow && console.log('Assigned ', nextGap, 'to segment', assigned, 'now having length', newSegmentLength);
    } else {
      DEBUG && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing
      DEBUG.verboseSlow && console.log('Gap ', nextGap, `could not be assigned as it would have made an existing
        as it would have made an existing segment too long, or it was already too long itself.`
      );
    }

@@ -214,10 +215,10 @@ export function trilight(query, doc, {
  segments.sort(({score:a}, {score:b}) => b-a);
  const textSegments = segments.map(({start,end}) => oDoc.slice(start,end).join(''));
  //console.log(JSON.stringify({gaps}, null, 2));
  DEBUG && console.log(segments, textSegments);
  DEBUG.verboseSlow && console.log(segments, textSegments);

  if ( textSegments.length === 0 ) {
    DEBUG && console.log({query, doc, maxLength, ngramSize, maxSegmentSize,
    DEBUG.verboseSlow && console.log({query, doc, maxLength, ngramSize, maxSegmentSize,
      trigrams,
      index,
      entries,
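The trilight() change above swaps a plain object for a Map as the trigram index. One concrete reason such a swap is usually made (a general JavaScript fact, not stated in the commit): plain-object lookups can hit inherited prototype keys, while Map lookups see only entries that were actually set.

const objIndex = {};
console.log(objIndex['constructor']);    // [Function: Object] — a phantom hit from the prototype
console.log('constructor' in objIndex);  // true, despite never being set

const mapIndex = new Map();
console.log(mapIndex.get('constructor')); // undefined — only real entries are found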
@@ -1,16 +1,26 @@
import http from 'http';
import https from 'https';
import fs from 'fs';
import os from 'os';
import path from 'path';

import express from 'express';

import args from './args.js';
import {
  GO_SECURE,
  MAX_REAL_URL_LENGTH,
  MAX_HEAD, MAX_HIGHLIGHTABLE_LENGTH, DEBUG,
  say, sleep, APP_ROOT
  say, sleep, APP_ROOT,
  RichError
} from './common.js';
import Archivist from './archivist.js';
import {trilight, /*highlight*/} from './highlighter.js';
import {startCrawl, Archivist} from './archivist.js';
import {trilight, highlight} from './highlighter.js';

const SITE_PATH = path.resolve(APP_ROOT, '..', 'public');

const SearchCache = new Map();

const app = express();

let running = false;

@@ -20,18 +30,39 @@ const LibraryServer = {
  start, stop
}

const secure_options = {};
const protocol = GO_SECURE ? https : http;

export default LibraryServer;

async function start({server_port}) {
  if ( running ) {
    DEBUG && console.warn(`Attempting to start server when it is not closed. Exiting start()...`);
    DEBUG.verboseSlow && console.warn(`Attempting to start server when it is not closed. Exiting start()...`);
    return;
  }
  running = true;

  try {
    const sec = {
      key: fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'privkey.pem')),
      cert: fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'fullchain.pem')),
      ca: fs.existsSync(path.resolve(os.homedir(), 'local-sslcerts', 'chain.pem')) ?
        fs.readFileSync(path.resolve(os.homedir(), 'local-sslcerts', 'chain.pem'))
        :
        undefined
    };
    console.log({sec});
    Object.assign(secure_options, sec);
  } catch(e) {
    console.warn(`No certs found so will use insecure no SSL.`);
  }

  try {
    port = server_port;
    addHandlers();
    Server = app.listen(Number(port), err => {
    const secure = secure_options.cert && secure_options.key;
    const server = protocol.createServer.apply(protocol, GO_SECURE && secure ? [secure_options, app] : [app]);
    Server = server.listen(Number(port), err => {
      if ( err ) {
        running = false;
        throw err;

@@ -41,12 +72,13 @@ async function start({server_port}) {
    });
  } catch(e) {
    running = false;
    DEBUG && console.error(`Error starting server`, e);
    DEBUG.verboseSlow && console.error(`Error starting server`, e);
    process.exit(1);
  }
}

function addHandlers() {
  app.use(express.urlencoded({extended:true}));
  app.use(express.urlencoded({extended:true, limit: '50mb'}));
  app.use(express.static(SITE_PATH));

  if ( args.library_path() ) {

@@ -55,8 +87,29 @@ function addHandlers() {

  app.get('/search(.json)?', async (req, res) => {
    await Archivist.isReady();
    const {query, results:resultIds, HL} = await Archivist.search(req.query.query);
    const results = resultIds.map(docId => Archivist.getDetails(docId));
    let {query:oquery} = req.query;
    if ( ! oquery ) {
      return res.end(SearchResultView({results:[], query:'', HL:new Map, page:1}));
    }
    oquery = oquery.trim();
    if ( ! oquery ) {
      return res.end(SearchResultView({results:[], query:'', HL:new Map, page:1}));
    }
    let {page} = req.query;
    if ( ! page || ! Number.isInteger(parseInt(page)) ) {
      page = 1;
    } else {
      page = parseInt(page);
    }
    let resultIds, query, HL;
    if ( SearchCache.has(req.query.query) ) {
      ({query, resultIds, HL} = SearchCache.get(oquery));
    } else {
      ({query, results:resultIds, HL} = await Archivist.search(oquery));
      SearchCache.set(req.query.query, {query, resultIds, HL});
    }
    const start = (page-1)*args.results_per_page;
    const results = resultIds.slice(start,start+args.results_per_page).map(docId => Archivist.getDetails(docId))
    if ( req.path.endsWith('.json') ) {
      res.end(JSON.stringify({
        results, query

@@ -73,7 +126,7 @@ function addHandlers() {
        .map(segment => Archivist.findOffsets(query, segment))
        .join(' ... ');
    });
    res.end(SearchResultView({results, query, HL}));
    res.end(SearchResultView({results, query, HL, page}));
  }
});

@@ -133,6 +186,63 @@ function addHandlers() {
      res.redirect('/');
    }
  });

  app.post('/crawl', async (req, res) => {
    try {
      let {
        links, timeout, depth, saveToFile,
        maxPageCrawlTime, minPageCrawlTime, batchSize,
        program,
      } = req.body;
      const oTimeout = timeout;
      timeout = Math.round(parseFloat(timeout)*1000);
      depth = Math.round(parseInt(depth));
      batchSize = Math.round(parseInt(batchSize));
      saveToFile = !!saveToFile;
      minPageCrawlTime = Math.round(parseInt(minPageCrawlTime)*1000);
      maxPageCrawlTime = Math.round(parseInt(maxPageCrawlTime)*1000);
      if ( Number.isNaN(timeout) || Number.isNaN(depth) || typeof links != 'string' ) {
        console.warn({invalid:{timeout,depth,links}});
        throw new RichError({
          status: 400,
          message: 'Invalid parameters: timeout, depth or links'
        });
      }
      const urls = links.split(/[\n\s\r]+/g).map(u => u.trim()).filter(u => {
        const tooShort = u.length === 0;
        if ( tooShort ) return false;

        const tooLong = u.length > MAX_REAL_URL_LENGTH;
        if ( tooLong ) return false;

        let invalid = false;
        try {
          new URL(u);
        } catch {
          invalid = true;
        };
        if ( invalid ) return false;

        return true;
      }).map(url => ({url,depth:1}));
      console.log(`Starting crawl from ${urls.length} URLs, waiting ${oTimeout} seconds for each to load, and continuing to a depth of ${depth} clicks...`);
      await startCrawl({
        urls, timeout, depth, saveToFile, batchSize, minPageCrawlTime, maxPageCrawlTime, program,
      });
      res.end(`Starting crawl from ${urls.length} URLs, waiting ${oTimeout} seconds for each to load, and continuing to a depth of ${depth} clicks...`);
    } catch(e) {
      if ( e instanceof RichError ) {
        console.warn(e);
        const {status, message} = JSON.parse(e.message);
        res.status(status);
        res.end(message);
      } else {
        console.warn(e);
        res.sendStatus(500);
      }
      return;
    }
  });
}

async function stop() {

@@ -251,7 +361,7 @@ function IndexView(urls, {edit:edit = false} = {}) {
  `
}

function SearchResultView({results, query, HL}) {
function SearchResultView({results, query, HL, page}) {
  return `
    <!DOCTYPE html>
    <meta charset=utf-8>

@@ -273,7 +383,7 @@ function SearchResultView({results, query, HL}) {
    <p>
      Showing results for <b>${query}</b>
    </p>
    <ol>
    <ol class=results start="${(page-1)*args.results_per_page+1}">
    ${
      results.map(({snippet, url,title,id}) => `
        <li>

@@ -289,6 +399,19 @@ function SearchResultView({results, query, HL}) {
      `).join('\n')
    }
    </ol>
    <p class=cent>
      ${page > 1 ? `
        <a href=/search?query=${encodeURIComponent(query)}&page=${encodeURIComponent(page-1)}>
          < Page ${page-1}
        </a> |` : ''}
      <span class=grey>
        Page ${page}
      </span>
      |
      <a href=/search?query=${encodeURIComponent(query)}&page=${encodeURIComponent(page+1)}>
        Page ${page+1} >
      </a>
    </p>
  `
}
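The pagination added above serves one page of result ids per request; the slice arithmetic, spelled out with 10 as the value of results_per_page (the value visible in the args.js hunk earlier in this commit):

const results_per_page = 10; // see src/args.js
for ( const page of [1, 2, 3] ) {
  const start = (page - 1) * results_per_page;
  // page 1 -> slice(0, 10), page 2 -> slice(10, 20), page 3 -> slice(20, 30);
  // the <ol start=...> attribute keeps the visible numbering continuous across pages.
  console.log(page, start, start + results_per_page);
}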
@@ -40,7 +40,7 @@ export async function connect({port:port = 9222} = {}) {

  return {
    send,
    on, ons,
    on, ons, ona,
    close
  };

@@ -59,17 +59,24 @@ export async function connect({port:port = 9222} = {}) {
    const outGoing = JSON.stringify(message);
    MESSAGES.set(key, outGoing);
    socket.send(outGoing);
    DEBUG && (SHOW_FETCH || !method.startsWith('Fetch')) && console.log("Sent", message);
    DEBUG.verboseSlow && (SHOW_FETCH || !method.startsWith('Fetch')) && console.log("Sent", message);
    return promise;
  }

  async function handle(message) {
    if ( typeof message !== "string" ) {
      try {
        message += '';
      } catch(e) {
        message = message.toString();
      }
    }
    const stringMessage = message;
    message = JSON.parse(message);
    if ( message.error ) {
      const showError = DEBUG || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
      const showError = DEBUG.protocol || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
      if ( showError ) {
        console.warn(message);
        DEBUG.protocol && console.warn(message);
      }
    }
    const {sessionId} = message;

@@ -80,7 +87,7 @@ export async function connect({port:port = 9222} = {}) {
    const key = `${sessionId||ROOT_SESSION}:${id}`;
    const resolve = Resolvers[key];
    if ( ! resolve ) {
      console.warn(`No resolver for key`, key, stringMessage.slice(0,140));
      DEBUG.protocol && console.warn(`No resolver for key`, key, stringMessage.slice(0,140));
    } else {
      Resolvers[key] = undefined;
      try {

@@ -94,7 +101,7 @@ export async function connect({port:port = 9222} = {}) {
      const showError = DEBUG || !ERROR_CODE_SAFE_TO_IGNORE.has(message.error.code);
      if ( showError ) {
        const originalMessage = MESSAGES.get(key);
        console.warn({originalMessage});
        DEBUG.protocol && console.warn({originalMessage});
      }
    }
  }

@@ -131,6 +138,20 @@ export async function connect({port:port = 9222} = {}) {
    listeners.push(handler);
  }

  function ona(method, handler, sessionId) {
    let listeners = Handlers[method];
    if ( ! listeners ) {
      Handlers[method] = listeners = [];
    }
    listeners.push(({message}) => {
      if ( message.sessionId === sessionId ) {
        handler(message.params);
      } else {
        console.log(`No such`, {method, handler, sessionId, message});
      }
    });
  }

  function close() {
    socket.close();
  }
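The new ona() above registers a session-scoped listener: unlike on(), its handler only fires when the incoming event's sessionId matches. A condensed restatement of the pattern (taken from the diff itself, with the logging branch omitted; Handlers as in the surrounding module):

function ona(method, handler, sessionId) {
  let listeners = Handlers[method];
  if ( ! listeners ) {
    Handlers[method] = listeners = [];
  }
  listeners.push(({message}) => {
    // Forward only events that belong to this CDP session.
    if ( message.sessionId === sessionId ) {
      handler(message.params);
    }
  });
}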