# Provenance (from the original patch header):
#   commit 0ea5af88b66f27cf905c108d8587911930051562
#   From: Matthew Honnibal
#   Date: Tue, 7 Apr 2015 03:45:40 +0200
#   Subject: [PATCH] * Add multi-word expression RegexMatcher
#   New file: spacy/multi_words.py

try:
    _text_type = unicode  # Python 2  # noqa: F821
except NameError:
    _text_type = str  # Python 3: ``str`` is already the unicode text type


class RegexMerger(object):
    """Merge spans of a token sequence that match regular expressions.

    Each configured regex is run over the text form of the token sequence,
    and every match is handed to ``tokens.merge`` together with the tag and
    entity type configured for that regex.
    """

    def __init__(self, regexes):
        """Store the merge rules.

        Args:
            regexes: Iterable of ``(tag, entity_type, regex)`` triples, where
                ``regex`` exposes ``finditer`` (e.g. a compiled ``re``
                pattern). It is stored as given and iterated on every call,
                so pass a re-iterable container rather than a one-shot
                iterator.
        """
        self.regexes = regexes

    def __call__(self, tokens):
        """Apply every rule to ``tokens``, merging each matched span.

        Args:
            tokens: Object that can be converted to text and that provides
                ``merge(start, end, tag, text, entity_type)``. ``start`` and
                ``end`` are character offsets into that text form.

        Returns:
            None. The work is done through the calls to ``tokens.merge``.
        """
        for tag, entity_type, regex in self.regexes:
            # The text is rendered once per rule (not once per call), so a
            # later rule sees whatever text ``tokens`` yields after the
            # merges of the earlier rules.
            text = _text_type(tokens)
            for match in regex.finditer(text):
                # NOTE(review): the matched text is passed as the fourth
                # argument -- presumably the string/lemma of the merged
                # token; confirm against the ``merge`` signature.
                tokens.merge(
                    match.start(),
                    match.end(),
                    tag,
                    match.group(),
                    entity_type,
                )