From b59e3b157f593f5f74981bf9d39deba7bc9a12a4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 20 May 2018 15:15:37 +0200 Subject: [PATCH] Don't require attrs argument in Doc.retokenize and allow both ints and unicode (resolves #2304) --- spacy/tests/doc/test_doc_api.py | 21 +++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 24 ++++++++++++++---------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 06f6a3d30..d9db0916b 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from ..util import get_doc from ...tokens import Doc from ...vocab import Vocab +from ...attrs import LEMMA import pytest import numpy @@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer): doc.merge(8, 32, tag='', lemma='', ent_type='ORG') +def test_doc_api_retokenizer(en_tokenizer): + doc = en_tokenizer("WKRO played songs by the beach boys all night") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7]) + assert len(doc) == 7 + assert doc[4].text == 'the beach boys' + + +def test_doc_api_retokenizer_attrs(en_tokenizer): + doc = en_tokenizer("WKRO played songs by the beach boys all night") + # test both string and integer attributes and values + attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']} + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) + assert len(doc) == 7 + assert doc[4].text == 'the beach boys' + assert doc[4].lemma_ == 'boys' + assert doc[4].ent_type_ == 'ORG' + + def test_doc_api_sents_empty_string(en_tokenizer): doc = en_tokenizer("") doc.is_parsed = True diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 00f724ed6..b405dd000 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -11,11 +11,13 @@ from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport * +from ..attrs cimport TAG +from ..attrs import intify_attrs +from ..util import SimpleFrozenDict cdef class Retokenizer: - '''Helper class for doc.retokenize() context manager.''' + """Helper class for doc.retokenize() context manager.""" cdef Doc doc cdef list merges cdef list splits @@ -24,14 +26,18 @@ cdef class Retokenizer: self.merges = [] self.splits = [] - def merge(self, Span span, attrs=None): - '''Mark a span for merging. The attrs will be applied to the resulting - token.''' + def merge(self, Span span, attrs=SimpleFrozenDict()): + """Mark a span for merging. The attrs will be applied to the resulting + token. + """ + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span.start_char, span.end_char, attrs)) - def split(self, Token token, orths, attrs=None): - '''Mark a Token for splitting, into the specified orths. The attrs - will be applied to each subtoken.''' + def split(self, Token token, orths, attrs=SimpleFrozenDict()): + """Mark a Token for splitting, into the specified orths. The attrs + will be applied to each subtoken. + """ + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.splits.append((token.start_char, orths, attrs)) def __enter__(self): @@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes): # Clear the cached Python objects # Return the merged Python object return doc[start] - -