Add function for entity->biluo transformation

2016-10-15 21:51:04 +02:00 · 2016-10-15 21:51:04 +02:00 · 86ae665c78
parent 2163fd238f
commit 86ae665c78
1 changed files with 63 additions and 0 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -267,6 +267,69 @@ cdef class GoldParse:
        return not nonproj.is_nonproj_tree(self.heads)
 def biluo_tags_from_offsets(doc, entities):
    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (biluo).
    Arguments:
        doc (Doc):
            The document that the entity offsets refer to. The output tags will
            refer to the token boundaries within the document.
        entities (sequence):
            A sequence of (start, end, label) triples. start and end should be
            character-offset integers denoting the slice into the original string.
    Returns:
        tags (list):
            A list of unicode strings, describing the tags. Each tag string will
            be of the form either "", "O" or "{action}-{label}", where action is one
            of "B", "I", "L", "U". The empty string "" is used where the entity
            offsets don't align with the tokenization in the Doc object. The
            training algorithm will view these as missing values. "O" denotes
            a non-entity token. "B" denotes the beginning of a multi-token entity,
            "I" the inside of an entity of three or more tokens, and "L" the end
            of an entity of two or more tokens. "U" denotes a single-token entity.
    Example:
        text = 'I like London.'
        entities = [(len('I like '), len('I like London'), 'LOC')]
        doc = nlp.tokenizer(text)
        tags = biluo_tags_from_offsets(doc, entities)
        assert tags == ['O', 'O', 'U-LOC', 'O']
    '''
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['' for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = 'U-%s' % label
            else:
                biluo[start_token] = 'B-%s' % label
                for i in range(start_token+1, end_token):
                    biluo[i] = 'I-%s' % label
                biluo[end_token] = 'L-%s' % label
    # Now distinguish the O cases from ones where we miss the tokenization
    entity_chars = set()
    for start_char, end_char, label in entities:
        for i in range(start_char, end_char):
            entity_chars.add(i)
    for token in doc:
        for i in range(token.idx, token.idx+len(token)):
            if i in entity_chars:
                break
        else:
            biluo[token.i] = 'O'
    return biluo
 def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'