// Diskernet/lib/ndx.js

// Source: https://github.com/ndx-search/ndx
// License: MIT License
  // Source: https://github.com/ndx-search/ndx/commit/cc9ec2780d88918338d4edcfca2d4304af9dc721

// Changes by Cris Stringfellow:
// I changed the _vacuumIndex function to use a stack instead of recursion.

/**
 * Builds an empty Index.
 *
 * The result holds an empty `docs` map, a root inverted index node with char code `0`, and one
 * `{ sum: 0, avg: 0 }` statistics record per field.
 *
 * @typeparam T Document key.
 * @param fieldsNum Number of fields.
 * @returns {@link Index}
 */
export function createIndex(fieldsNum) {
    const fieldStats = [];
    let position = 0;
    while (position < fieldsNum) {
        fieldStats[position] = { sum: 0, avg: 0 };
        position += 1;
    }
    const docs = new Map();
    const root = createInvertedIndexNode(0);
    return { docs, root, fields: fieldStats };
}
/**
 * Builds a detached inverted index node: no sibling, no children and no postings.
 *
 * @typeparam T Document key.
 * @param charCode Char code.
 * @returns {@link InvertedIndexNode} instance.
 */
export function createInvertedIndexNode(charCode) {
    const node = { charCode, next: null, firstChild: null, firstDoc: null };
    return node;
}
/**
 * Walks the inverted index from `node`, following one child per character of `term`.
 *
 * The walk stops as soon as a character has no matching child. An empty `term` yields `node` itself.
 *
 * @typeparam T Document key.
 * @param node Root node.
 * @param term Term.
 * @returns Inverted index node that contains `term` or an `undefined` value.
 */
export function findInvertedIndexNode(node, term) {
    let current = node;
    let position = 0;
    while (current !== undefined && position < term.length) {
        current = findInvertedIndexChildNodeByCharCode(current, term.charCodeAt(position));
        position += 1;
    }
    return current;
}
/**
 * Scans the sibling list of `node`'s children for the one carrying `charCode`.
 *
 * @typeparam T Document key.
 * @param node {@link InvertedIndexNode}
 * @param charCode Char code.
 * @returns Matching {@link InvertedIndexNode} or `undefined`.
 */
export function findInvertedIndexChildNodeByCharCode(node, charCode) {
    for (let candidate = node.firstChild; candidate !== null; candidate = candidate.next) {
        if (candidate.charCode === charCode) {
            return candidate;
        }
    }
    return undefined;
}
/**
 * Makes `child` the first child of `parent`.
 *
 * When `parent` already has children, the previous first child becomes `child.next`; otherwise `child.next` is left
 * as it was.
 *
 * @typeparam T Document key.
 * @param parent Parent node.
 * @param child Child node to add.
 */
export function addInvertedIndexChildNode(parent, child) {
    const formerHead = parent.firstChild;
    if (formerHead !== null) {
        child.next = formerHead;
    }
    parent.firstChild = child;
}
/**
 * Puts the posting `doc` at the head of `node`'s posting list.
 *
 * When `node` already has postings, the previous head becomes `doc.next`; otherwise `doc.next` is left as it was.
 *
 * @typeparam T Document key.
 * @param node Inverted index node.
 * @param doc Posting.
 */
export function addInvertedIndexDoc(node, doc) {
    const formerHead = node.firstDoc;
    if (formerHead !== null) {
        doc.next = formerHead;
    }
    node.firstDoc = doc;
}
/**
 * Indexes a document under `key`.
 *
 * For every field, the accessor output is tokenized, each token is passed through `filter`, and the non-empty terms
 * are counted per field. The field statistics (`sum`, `avg`) are updated, the document details are stored in
 * `index.docs`, and one posting per distinct term is attached to the matching inverted index node; missing nodes are
 * created along the way.
 *
 * @typeparam T Document key.
 * @typeparam D Document type.
 * @param index {@link Index}.
 * @param fieldAccessors Field accessors, one per field. A field whose accessor returns `undefined` gets length `0`
 *  and its statistics are left untouched.
 * @param tokenizer Tokenizer is a function that breaks a text into words, phrases, symbols, or other meaningful
 *  elements called tokens.
 * @param filter Filter is a function that processes tokens and returns terms, terms are used in Inverted Index to index
 *  documents.
 * @param key Document key.
 * @param doc Document.
 */
export function addDocumentToIndex(index, fieldAccessors, tokenizer, filter, key, doc) {
    const { docs, root, fields } = index;
    const fieldLengths = [];
    // term -> per-field occurrence counts within this document
    const frequenciesByTerm = new Map();
    for (let fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
        const text = fieldAccessors[fieldIdx](doc);
        if (text === undefined) {
            fieldLengths.push(0);
            continue;
        }
        const stats = fields[fieldIdx];
        const tokens = tokenizer(text);
        // number of tokens that survive the filter (empty strings are dropped)
        let kept = 0;
        for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
            const term = filter(tokens[tokenIdx]);
            if (term === "") {
                continue;
            }
            kept += 1;
            let perField = frequenciesByTerm.get(term);
            if (perField === undefined) {
                perField = new Array(fields.length).fill(0);
                frequenciesByTerm.set(term, perField);
            }
            perField[fieldIdx] += 1;
        }
        stats.sum += kept;
        // the document is not in `docs` yet, hence the `+ 1`
        stats.avg = stats.sum / (docs.size + 1);
        fieldLengths[fieldIdx] = kept;
    }
    const details = { key, fieldLengths };
    docs.set(key, details);
    for (const [term, termFrequency] of frequenciesByTerm) {
        let node = root;
        for (let pos = 0; pos < term.length; pos++) {
            // a node without children cannot match, so the lookup is skipped
            const match = node.firstChild === null
                ? undefined
                : findInvertedIndexChildNodeByCharCode(node, term.charCodeAt(pos));
            if (match === undefined) {
                // no existing path from here on: create the rest of the term in one go
                node = createInvertedIndexNodes(node, term, pos);
                break;
            }
            node = match;
        }
        addInvertedIndexDoc(node, { next: null, details, termFrequency });
    }
}
/**
 * Appends a chain of new inverted index nodes below `parent`, one node per character of `term` from position
 * `start` to the end of the term.
 *
 * @typeparam T Document key.
 * @param parent Parent node.
 * @param term Term.
 * @param start First char code position in the `term`.
 * @returns Leaf {@link InvertedIndexNode}; `parent` itself when `start` is not below `term.length`.
 */
function createInvertedIndexNodes(parent, term, start) {
    let tail = parent;
    let position = start;
    while (position < term.length) {
        const created = createInvertedIndexNode(term.charCodeAt(position));
        addInvertedIndexChildNode(tail, created);
        tail = created;
        position++;
    }
    return tail;
}
/**
 * Remove document from the index.
 *
 * The document details are deleted from `index.docs`, its key is recorded in `removed`, and the statistics of every
 * field in which the document had a non-zero length are adjusted. The inverted index nodes are not touched here;
 * postings of removed documents are unlinked later by `vacuumIndex`.
 *
 * Unknown keys are ignored.
 *
 * @typeparam T Document key.
 * @param index {@link Index}.
 * @param removed Set of removed document ids.
 * @param key Document key.
 */
export function removeDocumentFromIndex(index, removed, key) {
    const { docs, fields } = index;
    const docDetails = docs.get(key);
    if (docDetails === undefined) {
        return;
    }
    removed.add(key);
    docs.delete(key);
    const remainingDocs = docs.size;
    for (let i = 0; i < fields.length; i++) {
        const fieldLength = docDetails.fieldLengths[i];
        if (fieldLength > 0) {
            const field = fields[i];
            field.sum -= fieldLength;
            // An empty index has no meaningful average; use 0 (the initial value) instead of dividing by zero,
            // which would leave NaN in the statistics.
            field.avg = remainingDocs > 0 ? field.sum / remainingDocs : 0;
        }
    }
}
/**
 * Purges the postings of removed documents from the {@link Index}, starting at its root node, and then empties the
 * `removed` set.
 *
 * @typeparam T Document key.
 * @param index {@link Index}.
 * @param removed Set of removed document ids.
 */
export function vacuumIndex(index, removed) {
    const { root } = index;
    _vacuumIndex(root, removed);
    removed.clear();
}
/**
 * Cleans up removed documents from the subtree rooted at `node`.
 *
 * Postings whose document key is in `removed` are unlinked, and child nodes whose subtree no longer holds any
 * posting are unlinked from their parent. The traversal keeps its own stack of pending nodes instead of recursing,
 * so the depth of the tree cannot exhaust the JavaScript call stack.
 *
 * @typeparam T Document key.
 * @param node {@link InvertedIndexNode}
 * @param removed Set of removed document ids.
 * @returns `1` when subtree contains any document, otherwise `0`.
 */
function _vacuumIndex(node, removed) {
    const stack = [openVacuumFrame(node, removed)];
    for (;;) {
        const frame = stack[stack.length - 1];
        if (frame.child !== null) {
            // Descend first; `frame.child` keeps pointing at this child until its subtree is finished.
            stack.push(openVacuumFrame(frame.child, removed));
            continue;
        }
        // All children of `frame.node` are done.
        stack.pop();
        if (stack.length === 0) {
            return frame.hasDocs;
        }
        const parent = stack[stack.length - 1];
        const finished = parent.child;
        parent.hasDocs |= frame.hasDocs;
        if (frame.hasDocs === 0) { // subtree doesn't have any documents, remove this node
            if (parent.prevChild === null) {
                parent.node.firstChild = finished.next;
            }
            else {
                parent.prevChild.next = finished.next;
            }
        }
        else {
            parent.prevChild = finished;
        }
        parent.child = finished.next;
    }
}
/**
 * Unlinks the removed postings of `node` and creates the traversal state used by `_vacuumIndex` for this node.
 *
 * @param node {@link InvertedIndexNode}
 * @param removed Set of removed document ids.
 * @returns Frame with the node, the next child to visit, the last kept child and a `0`/`1` flag telling whether any
 *  posting has been found in the subtree so far.
 */
function openVacuumFrame(node, removed) {
    let prevPosting = null;
    for (let posting = node.firstDoc; posting !== null; posting = posting.next) {
        if (removed.has(posting.details.key)) {
            if (prevPosting === null) {
                node.firstDoc = posting.next;
            }
            else {
                prevPosting.next = posting.next;
            }
        }
        else {
            prevPosting = posting;
        }
    }
    return {
        node: node,
        prevChild: null,
        child: node.firstChild,
        hasDocs: node.firstDoc === null ? 0 : 1,
    };
}
//# sourceMappingURL=index.js.map