* Add merge() method to Tokens, with fairly brittle/hacky implementation, but quite easy to test. Passing minimal tests. Still need to fix left/right deps in C data

Matthew Honnibal 2015-03-30 01:37:41 +02:00
parent 557856e84c
commit e70b87efeb
1 changed file with 85 additions and 0 deletions
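In rough terms, the new merge() takes character offsets into the original string, collapses the tokens they cover into a single token in place, and returns the merged token (or None if the offsets don't align with token boundaries). A minimal usage sketch, assuming a hypothetical English pipeline object `nlp`:

    # Sketch only; `nlp` is an assumed pipeline returning a Tokens object.
    tokens = nlp(u'The New York Times is a newspaper')
    # Characters 4..18 cover "New York Times" exactly
    new_york = tokens.merge(4, 18, u'NNP', u'New York Times', u'ORG')
    assert new_york is not None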


@@ -1,8 +1,10 @@
# cython: embedsignature=True
from libc.string cimport memset
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
from .strings cimport slice_unicode
from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
@@ -11,6 +13,7 @@ from .typedefs cimport POS, LEMMA
from .parts_of_speech import UNIV_POS_NAMES
from .lexeme cimport check_flag
from .spans import Span
from .structs cimport UniStr
from unidecode import unidecode
@@ -253,6 +256,88 @@ cdef class Tokens:
        for i in range(self.length):
            self.data[i] = parsed[i]

    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
              unicode ent_type):
        cdef int i
        cdef int start = -1
        cdef int end = -1
        for i in range(self.length):
            if self.data[i].idx == start_idx:
                start = i
            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
                end = i + 1
                break
        else:
            return None
        # Get LexemeC for newly merged token
        cdef UniStr new_orth_c
        slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
        cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
        # House the new merged token where it starts
        cdef TokenC* token = &self.data[start]
        # Update fields
        token.lex = lex
        # What to do about morphology??
        # TODO: token.morph = ???
        token.tag = self.vocab.strings[tag]
        token.lemma = self.vocab.strings[lemma]
        if ent_type == 'O':
            token.ent_iob = 2
            token.ent_type = 0
        else:
            token.ent_iob = 3
            token.ent_type = self.vocab.strings[ent_type]
        # Fix dependencies
        # Begin by setting all the head indices to absolute token positions
        # This is easier to work with for now than the offsets
        for i in range(self.length):
            self.data[i].head += i
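        # (Illustration: each token stores its head as a relative offset, so
        # if token 5 has head offset -2, self.data[5].head becomes the
        # absolute index 3 after this loop.)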
        # Find the head of the merged token, and its dep relation
        outer_heads = {}
        for i in range(start, end):
            head_idx = self.data[i].head
            if head_idx == i or head_idx < start or head_idx >= end:
                # Don't consider "heads" which are actually dominated by a word
                # in the region we're merging
                gp = head_idx
                while self.data[gp].head != gp:
                    if start <= gp < end:
                        break
                    gp = self.data[gp].head
                else:
                    # If we have multiple words attaching to the same head,
                    # but with different dep labels, we're preferring the last
                    # occurring dep label. Shrug. What else could we do, I guess?
                    outer_heads[head_idx] = self.data[i].dep
        token.head, token.dep = max(outer_heads.items())
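        # (Note: the dict write above keeps the last dep label seen for each
        # outer head, and max() then attaches the merged token to the outer
        # head with the highest absolute index.)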
        # Adjust deps before shrinking tokens
        # Tokens which point into the merged token should now point to it
        # Subtract the offset from all tokens which point to >= end
        offset = (end - start) - 1
        for i in range(self.length):
            head_idx = self.data[i].head
            if start <= head_idx < end:
                self.data[i].head = start
            elif head_idx >= end:
                self.data[i].head -= offset
        # TODO: Fix left and right deps
        # Now compress the token array
        for i in range(end, self.length):
            self.data[i - offset] = self.data[i]
        for i in range(self.length - offset, self.length):
            memset(&self.data[i], 0, sizeof(TokenC))
            self.data[i].lex = &EMPTY_LEXEME
        self.length -= offset
        for i in range(self.length):
            # ...And, set heads back to a relative position
            self.data[i].head -= i
        # Clear cached Python objects
        self._py_tokens = [None] * self.length
        # Return the merged Python object
        return self[start]


cdef class Token:
    """An individual token --- i.e. a word, a punctuation symbol, etc. Created