spaCy/spacy/pipeline/_edit_tree_internals/edit_trees.pyx

# cython: infer_types=True, binding=True
from cython.operator cimport dereference as deref
from libc.stdint cimport uint32_t
from libc.stdint cimport UINT32_MAX
from libc.string cimport memset
from libcpp.pair cimport pair
from libcpp.vector cimport vector

from pathlib import Path

from ...typedefs cimport hash_t

from ... import util
from ...errors import Errors
from ...strings import StringStore
from .schemas import validate_edit_tree


NULL_TREE_ID = UINT32_MAX

cdef LCS find_lcs(str source, str target):
    """
    Find the longest common subsequence (LCS) between two strings. If there are
    multiple LCSes, only one of them is returned.

    source (str): The first string.
    target (str): The second string.
    RETURNS (LCS): The spans of the longest common subsequences.
    """
    cdef Py_ssize_t source_len = len(source)
    cdef Py_ssize_t target_len = len(target)
    cdef size_t longest_align = 0;
    cdef int source_idx, target_idx
    cdef LCS lcs
    cdef Py_UCS4 source_cp, target_cp

    memset(&lcs, 0, sizeof(lcs))

    cdef vector[size_t] prev_aligns = vector[size_t](target_len);
    cdef vector[size_t] cur_aligns = vector[size_t](target_len);

    for (source_idx, source_cp) in enumerate(source):
        for (target_idx, target_cp) in enumerate(target):
            if source_cp == target_cp:
                if source_idx == 0 or target_idx == 0:
                    cur_aligns[target_idx] = 1
                else:
                    cur_aligns[target_idx] = prev_aligns[target_idx - 1] + 1

                # Check if this is the longest alignment and replace previous
                # best alignment when this is the case.
                if cur_aligns[target_idx] > longest_align:
                    longest_align = cur_aligns[target_idx]
                    lcs.source_begin = source_idx - longest_align + 1
                    lcs.source_end = source_idx + 1
                    lcs.target_begin = target_idx - longest_align + 1
                    lcs.target_end = target_idx + 1
            else:
                # No match, we start with a zero-length alignment.
                cur_aligns[target_idx] = 0
        swap(prev_aligns, cur_aligns)

    return lcs

cdef class EditTrees:
    """Container for constructing and storing edit trees."""
    def __init__(self, strings: StringStore):
        """Create a container for edit trees.

        strings (StringStore): the string store to use."""
        self.strings = strings

    cpdef uint32_t add(self, str form, str lemma):
        """Add an edit tree that rewrites the given string into the given lemma.

        RETURNS (int): identifier of the edit tree in the container.
        """
        # Treat two empty strings as a special case. Generating an edit
        # tree for identical strings results in a match node. However,
        # since two empty strings have a zero-length LCS, a substitution
        # node would be created. Since we do not want to clutter the
        # recursive tree construction with logic for this case, handle
        # it in this wrapper method.
        if len(form) == 0 and len(lemma) == 0:
            tree = edittree_new_match(0, 0, NULL_TREE_ID, NULL_TREE_ID)
            return self._tree_id(tree)

        return self._add(form, lemma)

    cdef uint32_t _add(self, str form, str lemma):
        cdef LCS lcs = find_lcs(form, lemma)

        cdef EditTreeC tree
        cdef uint32_t tree_id, prefix_tree, suffix_tree
        if lcs_is_empty(lcs):
            tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
        else:
            # If we have a non-empty LCS, such as "gooi" in "ge[gooi]d" and "[gooi]en",
            # create edit trees for the prefix pair ("ge"/"") and the suffix pair ("d"/"en").
            prefix_tree = NULL_TREE_ID
            if lcs.source_begin != 0 or lcs.target_begin != 0:
                prefix_tree = self.add(form[:lcs.source_begin], lemma[:lcs.target_begin])

            suffix_tree = NULL_TREE_ID
            if lcs.source_end != len(form) or lcs.target_end != len(lemma):
                suffix_tree = self.add(form[lcs.source_end:], lemma[lcs.target_end:])

            tree = edittree_new_match(lcs.source_begin, len(form) - lcs.source_end, prefix_tree, suffix_tree)

        return self._tree_id(tree)

    cdef uint32_t _tree_id(self, EditTreeC tree):
         # If this tree has been constructed before, return its identifier.
        cdef hash_t hash = edittree_hash(tree)
        cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
        if iter != self.map.end():
            return deref(iter).second

        #  The tree hasn't been seen before, store it.
        cdef uint32_t tree_id = self.trees.size()
        self.trees.push_back(tree)
        self.map.insert(pair[hash_t, uint32_t](hash, tree_id))

        return tree_id

    cpdef str apply(self, uint32_t tree_id, str form):
        """Apply an edit tree to a form.

        tree_id (uint32_t): the identifier of the edit tree to apply.
        form (str): the form to apply the edit tree to.
        RETURNS (str): the transformer form or None if the edit tree
            could not be applied to the form.
        """
        if tree_id >= self.trees.size():
            raise IndexError("Edit tree identifier out of range")

        lemma_pieces = []
        try:
            self._apply(tree_id, form, lemma_pieces)
        except ValueError:
            return None
        return "".join(lemma_pieces)

    cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces):
        """Recursively apply an edit tree to a form, adding pieces to
        the lemma_pieces list."""
        assert tree_id <= self.trees.size()

        cdef EditTreeC tree = self.trees[tree_id]
        cdef MatchNodeC match_node
        cdef int suffix_start

        if tree.is_match_node:
            match_node = tree.inner.match_node

            if match_node.prefix_len + match_node.suffix_len > len(form_part):
                raise ValueError("Edit tree cannot be applied to form")

            suffix_start = len(form_part) - match_node.suffix_len

            if match_node.prefix_tree != NULL_TREE_ID:
                self._apply(match_node.prefix_tree, form_part[:match_node.prefix_len], lemma_pieces)

            lemma_pieces.append(form_part[match_node.prefix_len:suffix_start])

            if match_node.suffix_tree != NULL_TREE_ID:
                self._apply(match_node.suffix_tree, form_part[suffix_start:], lemma_pieces)
        else:
            if form_part == self.strings[tree.inner.subst_node.orig]:
                lemma_pieces.append(self.strings[tree.inner.subst_node.subst])
            else:
                raise ValueError("Edit tree cannot be applied to form")

    cpdef unicode tree_to_str(self, uint32_t tree_id):
        """Return the tree as a string. The tree tree string is formatted
        like an S-expression. This is primarily useful for debugging. Match
        nodes have the following format:

        (m prefix_len suffix_len prefix_tree suffix_tree)

        Substitution nodes have the following format:

        (s original substitute)

        tree_id (uint32_t): the identifier of the edit tree.
        RETURNS (str): the tree as an S-expression.
        """

        if tree_id >= self.trees.size():
            raise IndexError("Edit tree identifier out of range")

        cdef EditTreeC tree = self.trees[tree_id]
        cdef SubstNodeC subst_node

        if not tree.is_match_node:
            subst_node = tree.inner.subst_node
            return f"(s '{self.strings[subst_node.orig]}' '{self.strings[subst_node.subst]}')"

        cdef MatchNodeC match_node = tree.inner.match_node

        prefix_tree = "()"
        if match_node.prefix_tree != NULL_TREE_ID:
            prefix_tree = self.tree_to_str(match_node.prefix_tree)

        suffix_tree = "()"
        if match_node.suffix_tree != NULL_TREE_ID:
            suffix_tree = self.tree_to_str(match_node.suffix_tree)

        return f"(m {match_node.prefix_len} {match_node.suffix_len} {prefix_tree} {suffix_tree})"

    def from_json(self, trees: list) -> "EditTrees":
        self.trees.clear()

        for tree in trees:
            tree = _dict2tree(tree)
            self.trees.push_back(tree)

        self._rebuild_tree_map()

    def from_bytes(self, bytes_data: bytes, *) -> "EditTrees":
        def deserialize_trees(tree_dicts):
            cdef EditTreeC c_tree
            for tree_dict in tree_dicts:
                c_tree = _dict2tree(tree_dict)
                self.trees.push_back(c_tree)

        deserializers = {}
        deserializers["trees"] = lambda n: deserialize_trees(n)
        util.from_bytes(bytes_data, deserializers, [])

        self._rebuild_tree_map()

        return self

    def to_bytes(self, **kwargs) -> bytes:
        tree_dicts = []
        for tree in self.trees:
            tree = _tree2dict(tree)
            tree_dicts.append(tree)

        serializers = {}
        serializers["trees"] = lambda: tree_dicts

        return util.to_bytes(serializers, [])

    def to_disk(self, path, **kwargs) -> "EditTrees":
        path = util.ensure_path(path)
        with path.open("wb") as file_:
            file_.write(self.to_bytes())

    def from_disk(self, path, **kwargs) -> "EditTrees":
        path = util.ensure_path(path)
        if path.exists():
            with path.open("rb") as file_:
                data = file_.read()
            return self.from_bytes(data)

        return self

    def __getitem__(self, idx):
        return _tree2dict(self.trees[idx])

    def __len__(self):
        return self.trees.size()

    def _rebuild_tree_map(self):
        """Rebuild the tree hash -> tree id mapping"""
        cdef EditTreeC c_tree
        cdef uint32_t tree_id
        cdef hash_t tree_hash

        self.map.clear()

        for tree_id in range(self.trees.size()):
            c_tree = self.trees[tree_id]
            tree_hash = edittree_hash(c_tree)
            self.map.insert(pair[hash_t, uint32_t](tree_hash, tree_id))

    def __reduce__(self):
        return (unpickle_edittrees, (self.strings, self.to_bytes()))


def unpickle_edittrees(strings, trees_data):
    return EditTrees(strings).from_bytes(trees_data)


def _tree2dict(tree):
    if tree["is_match_node"]:
        tree = tree["inner"]["match_node"]
    else:
        tree = tree["inner"]["subst_node"]
    return(dict(tree))

def _dict2tree(tree):
    errors = validate_edit_tree(tree)
    if errors:
        raise ValueError(Errors.E1026.format(errors="\n".join(errors)))

    tree = dict(tree)
    if "prefix_len" in tree:
        tree = {"is_match_node": True, "inner": {"match_node": tree}}
    else:
        tree = {"is_match_node": False, "inner": {"subst_node": tree}}

    return tree