Use a `Tokens.string` property for the document text.

RegexMerger.__call__ now runs its regexes over `tokens.string` instead of
`unicode(tokens)`; the new `Tokens.string` property returns `unicode(self)`.

NOTE(review): line breaks and indentation in this patch were restored from a
collapsed single-line copy -- confirm the context lines against the target
files before applying.

diff --git a/spacy/multi_words.py b/spacy/multi_words.py
index 8f8cda2b9..748086d30 100644
--- a/spacy/multi_words.py
+++ b/spacy/multi_words.py
@@ -4,5 +4,5 @@ class RegexMerger(object):
 
     def __call__(self, tokens):
         for tag, entity_type, regex in self.regexes:
-            for m in regex.finditer(unicode(tokens)):
+            for m in regex.finditer(tokens.string):
                 tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 8cb86c7ec..411f5cd95 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -136,6 +136,10 @@ cdef class Tokens:
         cdef const TokenC* last = &self.data[self.length - 1]
         return self._string[:last.idx + last.lex.length]
 
+    @property
+    def string(self):
+        return unicode(self)
+
     @property
     def ents(self):
         """Yields named-entity Span objects."""