try:
    _text_type = unicode  # Python 2: the builtin text type
except NameError:
    # Python 3 removed the `unicode` builtin; `str` is the text type there.
    _text_type = str


class RegexMerger(object):
    """Merge every regex match found in a token container's text.

    ``regexes`` is an iterable of ``(tag, entity_type, regex)`` triples,
    where ``regex`` exposes ``finditer`` (e.g. a compiled ``re`` pattern).
    """

    def __init__(self, regexes):
        """Store the ``(tag, entity_type, regex)`` triples to apply.

        The iterable is kept as given (not copied), and is walked again on
        every call, so pass a re-iterable sequence rather than a one-shot
        generator if the merger is going to be called more than once.
        """
        self.regexes = regexes

    def __call__(self, tokens):
        """Apply each regex to the text of ``tokens`` and merge the matches.

        For every match, ``tokens.merge`` is called with the match's start
        and end character offsets, the triple's tag, the matched text, and
        the triple's entity type, in that order.

        Args:
            tokens: An object whose text form is searched and which provides
                ``merge(start, end, tag, text, entity_type)``.
                NOTE(review): the fourth ``merge`` argument is presumably
                the lemma of the merged span -- confirm against the
                ``merge`` implementation.

        Returns:
            None. ``tokens`` is modified through its ``merge`` method.
        """
        for tag, entity_type, regex in self.regexes:
            # Render the text once per regex; the offsets handed to
            # ``merge`` are character positions within this string.
            text = _text_type(tokens)
            for match in regex.finditer(text):
                tokens.merge(match.start(), match.end(), tag,
                             match.group(), entity_type)