Diskernet/lib/ndx.js

264 lines
7.8 KiB
JavaScript

// Source: https://github.com/ndx-search/ndx
// License: MIT License
// Source: https://github.com/ndx-search/ndx/commit/cc9ec2780d88918338d4edcfca2d4304af9dc721
// Changes by Cris Stringfellow:
// I changed the _vacuumIndex function to use a stack instead of recursion.
/**
 * Builds an empty index.
 *
 * @typeparam T Document key.
 * @param fieldsNum Number of fields.
 * @returns {@link Index} holding an empty `docs` map, a root node created with char code `0`, and one
 * `{ sum: 0, avg: 0 }` statistics record per field.
 */
export function createIndex(fieldsNum) {
  const fields = [];
  let created = 0;
  while (created < fieldsNum) {
    fields.push({ sum: 0, avg: 0 });
    created++;
  }
  const docs = new Map();
  const root = createInvertedIndexNode(0);
  return { docs, root, fields };
}
/**
 * Builds a fresh inverted index node that has no sibling, no children and no postings.
 *
 * @typeparam T Document key.
 * @param charCode Char code stored on the node.
 * @returns {@link InvertedIndexNode} instance with `next`, `firstChild` and `firstDoc` set to `null`.
 */
export function createInvertedIndexNode(charCode) {
  const node = { charCode, next: null, firstChild: null, firstDoc: null };
  return node;
}
/**
 * Walks down from `node`, following one child per UTF-16 code unit of `term`.
 *
 * @typeparam T Document key.
 * @param node Root node.
 * @param term Term.
 * @returns The node reached after consuming the whole `term` (the starting node itself when `term` is
 * empty), or `undefined` as soon as a code unit has no matching child.
 */
export function findInvertedIndexNode(node, term) {
  let current = node;
  let position = 0;
  while (current !== undefined && position < term.length) {
    current = findInvertedIndexChildNodeByCharCode(current, term.charCodeAt(position));
    position++;
  }
  return current;
}
/**
 * Scans the children of `node` (the `firstChild` / `next` linked list) for one with the given `charCode`.
 *
 * @typeparam T Document key.
 * @param node {@link InvertedIndexNode} whose children are searched.
 * @param charCode Char code to look for.
 * @returns The first child whose `charCode` strictly equals the argument, or `undefined` when none does.
 */
export function findInvertedIndexChildNodeByCharCode(node, charCode) {
  for (let sibling = node.firstChild; sibling !== null; sibling = sibling.next) {
    if (sibling.charCode === charCode) {
      return sibling;
    }
  }
  return undefined;
}
/**
 * Inserts `child` at the head of the child list of `parent`.
 *
 * `child.next` is only overwritten when `parent` already has children; otherwise it is left as it was.
 *
 * @typeparam T Document key.
 * @param parent Parent node.
 * @param child Child node to add.
 */
export function addInvertedIndexChildNode(parent, child) {
  const previousHead = parent.firstChild;
  if (previousHead !== null) {
    child.next = previousHead;
  }
  parent.firstChild = child;
}
/**
 * Inserts the posting `doc` at the head of the posting list of `node`.
 *
 * `doc.next` is only overwritten when `node` already has postings; otherwise it is left as it was.
 *
 * @typeparam T Document key.
 * @param node Inverted index node.
 * @param doc Posting.
 */
export function addInvertedIndexDoc(node, doc) {
  const previousHead = node.firstDoc;
  if (previousHead !== null) {
    doc.next = previousHead;
  }
  node.firstDoc = doc;
}
/**
 * Adds a document to the index.
 *
 * For every field the accessor result is tokenized, each token is passed through `filter`, and the
 * non-empty terms are counted per field. The per-field statistics (`sum`, `avg`) are then updated, the
 * document details are stored in `index.docs`, and one posting per distinct term is prepended to the
 * matching trie node (missing nodes are created on the way).
 *
 * The average is computed as `sum / (docs.size + 1)`, i.e. it assumes `key` is not indexed yet. Adding a
 * key that is already present overwrites its `docs` entry but does not undo the field sums or postings
 * written by the earlier call.
 *
 * @typeparam T Document key.
 * @typeparam D Document type.
 * @param index {@link Index}.
 * @param fieldAccessors Field accessors, one per index field; an accessor returning `undefined` marks the
 * field as absent (length `0`).
 * @param tokenizer Tokenizer is a function that breaks a text into words, phrases, symbols, or other meaningful
 * elements called tokens.
 * @param filter Filter is a function that processes tokens and returns terms, terms are used in Inverted Index to index
 * documents. Tokens filtered to an empty string are ignored.
 * @param key Document key.
 * @param doc Document.
 */
export function addDocumentToIndex(index, fieldAccessors, tokenizer, filter, key, doc) {
  const { docs, root, fields } = index;
  // Number of documents once this one has been stored.
  const docsCount = docs.size + 1;
  const fieldLengths = [];
  // term -> array with the number of occurrences of the term in each field
  const termCounts = new Map();
  for (let i = 0; i < fields.length; i++) {
    const fieldValue = fieldAccessors[i](doc);
    let filteredTermsCount = 0;
    if (fieldValue !== undefined) {
      // tokenize text
      const terms = tokenizer(fieldValue);
      // filter and count terms, ignore empty strings
      for (let j = 0; j < terms.length; j++) {
        const term = filter(terms[j]);
        if (term !== "") {
          filteredTermsCount++;
          let counts = termCounts.get(term);
          if (counts === undefined) {
            counts = new Array(fields.length).fill(0);
            termCounts.set(term, counts);
          }
          counts[i] += 1;
        }
      }
    }
    const fieldDetails = fields[i];
    fieldDetails.sum += filteredTermsCount;
    // Recompute the average even when this document lacks the field: the document count grew, so
    // skipping the update would leave the average computed against the previous count.
    fieldDetails.avg = fieldDetails.sum / docsCount;
    fieldLengths.push(filteredTermsCount);
  }
  const details = { key, fieldLengths };
  docs.set(key, details);
  termCounts.forEach((termFrequency, term) => {
    let node = root;
    for (let i = 0; i < term.length; i++) {
      // The lookup yields `undefined` both for a childless node and for a missing char code; in either
      // case the rest of the term needs new nodes.
      const nextNode = findInvertedIndexChildNodeByCharCode(node, term.charCodeAt(i));
      if (nextNode === undefined) {
        node = createInvertedIndexNodes(node, term, i);
        break;
      }
      node = nextNode;
    }
    addInvertedIndexDoc(node, { next: null, details, termFrequency });
  });
}
/**
 * Appends a chain of new nodes below `parent`, one per code unit of `term` from position `start` onwards.
 * Each new node becomes a child of the previously created one.
 *
 * @typeparam T Document key.
 * @param parent Parent node.
 * @param term Term.
 * @param start First char code position in the `term`.
 * @returns Leaf {@link InvertedIndexNode}, or `parent` itself when `start` is not below `term.length`.
 */
function createInvertedIndexNodes(parent, term, start) {
  let tail = parent;
  let position = start;
  while (position < term.length) {
    const created = createInvertedIndexNode(term.charCodeAt(position));
    addInvertedIndexChildNode(tail, created);
    tail = created;
    position++;
  }
  return tail;
}
/**
 * Remove document from the index.
 *
 * The document is dropped from `index.docs`, its key is recorded in `removed`, and the per-field
 * statistics are updated. Postings in the trie are not touched here. Unknown keys are ignored.
 *
 * @typeparam T Document key.
 * @param index {@link Index}.
 * @param removed Set of removed document ids; `key` is added to it when the document was indexed.
 * @param key Document key.
 */
export function removeDocumentFromIndex(index, removed, key) {
  const { docs, fields } = index;
  const docDetails = docs.get(key);
  if (docDetails === undefined) {
    return;
  }
  removed.add(key);
  docs.delete(key);
  const remaining = docs.size;
  for (let i = 0; i < fields.length; i++) {
    const field = fields[i];
    const fieldLength = docDetails.fieldLengths[i];
    if (fieldLength > 0) {
      field.sum -= fieldLength;
    }
    // Refresh the average for every field because the document count changed, and avoid the 0 / 0
    // division (NaN) when the last document has just been removed.
    field.avg = remaining > 0 ? field.sum / remaining : 0;
  }
}
/**
 * Purges the postings of removed documents from the {@link Index}, starting at its root node, and then
 * empties the `removed` set.
 *
 * @typeparam T Document key.
 * @param index {@link Index}.
 * @param removed Set of removed document ids; cleared before returning.
 */
export function vacuumIndex(index, removed) {
  const { root } = index;
  _vacuumIndex(root, removed);
  removed.clear();
}
/**
 * Cleans up removed documents from the subtree rooted at `node`.
 *
 * Postings whose document key is in `removed` are unlinked from every node, and every child whose
 * subtree ends up without any posting is unlinked from its parent. The traversal is depth-first and uses
 * an explicit stack instead of recursion, so the depth of the trie (the length of the longest indexed
 * term) cannot overflow the call stack.
 *
 * @typeparam T Document key.
 * @param node {@link InvertedIndexNode}
 * @param removed Set of removed document ids.
 * @returns `1` when subtree contains any document, `0` otherwise.
 */
function _vacuumIndex(node, removed) {
  const stack = [openVacuumFrame(node, removed)];
  let result = 0;
  while (stack.length > 0) {
    const frame = stack[stack.length - 1];
    if (frame.child !== null) {
      // Descend into the next unvisited child; its result is folded into `frame` once it completes.
      stack.push(openVacuumFrame(frame.child, removed));
      continue;
    }
    // Every child of this node has been handled.
    stack.pop();
    result = frame.hasDocs;
    if (stack.length === 0) {
      break;
    }
    const parent = stack[stack.length - 1];
    const finished = frame.node;
    parent.hasDocs |= result;
    if (result === 0) {
      // subtree doesn't have any documents, remove this node
      if (parent.prevChild === null) {
        parent.node.firstChild = finished.next;
      } else {
        parent.prevChild.next = finished.next;
      }
    } else {
      parent.prevChild = finished;
    }
    parent.child = finished.next;
  }
  return result;
}

/**
 * Unlinks the postings of removed documents from `node` and creates the traversal state for it.
 *
 * @param node {@link InvertedIndexNode}
 * @param removed Set of removed document ids.
 * @returns Frame with the node, the last kept child (`prevChild`), the next child to visit (`child`) and
 * `hasDocs` (`1` when the node itself still has a posting, `0` otherwise).
 */
function openVacuumFrame(node, removed) {
  let prevPosting = null;
  for (let posting = node.firstDoc; posting !== null; posting = posting.next) {
    if (removed.has(posting.details.key)) {
      if (prevPosting === null) {
        node.firstDoc = posting.next;
      } else {
        prevPosting.next = posting.next;
      }
    } else {
      prevPosting = posting;
    }
  }
  return {
    node,
    prevChild: null,
    child: node.firstChild,
    hasDocs: node.firstDoc === null ? 0 : 1,
  };
}
//# sourceMappingURL=index.js.map