From 1748549aebce44b3e3dce42815fd281b4e6894bd Mon Sep 17 00:00:00 2001
From: Gyorgy Orosz
Date: Wed, 21 Dec 2016 23:16:19 +0100
Subject: [PATCH] Added exception pattern mechanism to the tokenizer.

---
 spacy/language.py                    |  8 ++++++-
 spacy/language_data/__init__.py      |  1 +
 spacy/language_data/special_cases.py |  5 ++++
 spacy/tests/tokenizer/test_urls.py   | 19 ++++++++++++++
 spacy/tokenizer.pxd                  |  2 ++
 spacy/tokenizer.pyx                  | 39 +++++++++++++++++++++-------
 spacy/util.py                        |  5 ++++
 7 files changed, 69 insertions(+), 10 deletions(-)
 create mode 100644 spacy/language_data/special_cases.py
 create mode 100644 spacy/tests/tokenizer/test_urls.py

diff --git a/spacy/language.py b/spacy/language.py
index c6f1376a4..16bffcd7b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -67,6 +67,10 @@ class BaseDefaults(object):
     @classmethod
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
+        if cls.exception_patterns:
+            rule_match = util.compile_rule_regex(cls.exception_patterns).match
+        else:
+            rule_match = None
         if cls.prefixes:
             prefix_search = util.compile_prefix_regex(cls.prefixes).search
         else:
@@ -80,7 +84,7 @@ class BaseDefaults(object):
         else:
             infix_finditer = None
         vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        return Tokenizer(vocab, rules=rules,
+        return Tokenizer(vocab, rules=rules, rule_match=rule_match,
                          prefix_search=prefix_search,
                          suffix_search=suffix_search,
                          infix_finditer=infix_finditer)
@@ -142,6 +146,8 @@ class BaseDefaults(object):
         pipeline.append(nlp.entity)
         return pipeline

+    exception_patterns = tuple(language_data.EXCEPTION_PATTERNS)
+
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)

     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index f6aa4317c..aa379d86d 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -3,3 +3,4 @@ from .punctuation import *
 from .tag_map import *
 from .entity_rules import *
 from .util import *
+from .special_cases import *
diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py
new file mode 100644
index 000000000..e7b2be5a5
--- /dev/null
+++ b/spacy/language_data/special_cases.py
@@ -0,0 +1,5 @@
+from __future__ import unicode_literals
+
+EXCEPTION_PATTERNS = r'''
+((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)
+'''.strip().split()
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
new file mode 100644
index 000000000..5d0654d50
--- /dev/null
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -0,0 +1,19 @@
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize("text", [
+    u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news&_r=0",
+    u"www.google.com?q=google",
+    u"google.com",
+    u"www.red-stars.com",
+    pytest.mark.xfail(u"red-stars.com"),
+    u"http://foo.com/blah_(wikipedia)#cite-1",
+    u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux",
+    u"mailto:foo.bar@baz.com",
+    u"mailto:foo-bar@baz-co.com"
+])
+def test_simple_url(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert tokens[0].orth_ == text
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index e53b7dbd1..24c76f7ee 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -16,6 +16,7 @@ cdef class Tokenizer:
     cdef PreshMap _specials

     cpdef readonly Vocab vocab
+    cdef public object rule_match
     cdef public object prefix_search
     cdef public object suffix_search
     cdef public object infix_finditer
@@ -24,6 +25,7 @@ cdef class Tokenizer:

     cpdef Doc tokens_from_list(self, list strings)
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
+    cdef int _match_rule(self, unicode string)
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 66c93528b..ec5b5ea87 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -28,7 +28,7 @@ from .tokens.doc cimport Doc
 cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment boundaries."""
     @classmethod
-    def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
+    def load(cls, path, Vocab vocab, rules=None, rule_match=None, prefix_search=None, suffix_search=None,
             infix_finditer=None):
         '''Load a Tokenizer, reading unsupplied components from the path.

@@ -39,6 +39,8 @@ cdef class Tokenizer:
                 A storage container for lexical types.
             rules (dict):
                 Exceptions and special-cases for the tokenizer.
+            rule_match:
+                Special case matcher. Signature of re.compile(string).match
             prefix_search:
                 Signature of re.compile(string).search
             suffix_search:
@@ -65,10 +67,9 @@ cdef class Tokenizer:
             with (path / 'tokenizer' / 'infix.txt').open() as file_:
                 entries = file_.read().split('\n')
                 infix_finditer = util.compile_infix_regex(entries).finditer
-        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer)
+        return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer)

-
-    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer):
+    def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer):
         '''Create a Tokenizer, to create Doc objects given unicode text.

         Arguments:
@@ -76,6 +77,9 @@ cdef class Tokenizer:
                 A storage container for lexical types.
             rules (dict):
                 Exceptions and special-cases for the tokenizer.
+            rule_match:
+                A function matching the signature of re.compile(string).match
+                to match special cases for the tokenizer.
             prefix_search:
                 A function matching the signature of re.compile(string).search
                 to match prefixes.
@@ -89,6 +93,7 @@ cdef class Tokenizer:
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
+        self.rule_match = rule_match
         self.prefix_search = prefix_search
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
@@ -100,8 +105,9 @@ cdef class Tokenizer:
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
-                self._prefix_re,
-                self._suffix_re,
+                self.rule_match,
+                self._prefix_re,
+                self._suffix_re,
                 self._infix_re)

         return (self.__class__, args, None, None)
@@ -202,9 +208,12 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
-        self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
+        if self._match_rule(span):
+            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+        else:
+            span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
+            self._attach_tokens(tokens, span, &prefixes, &suffixes)
+            self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)

     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
@@ -314,6 +323,18 @@ cdef class Tokenizer:
            cached.data.lexemes = <const LexemeC* const*>lexemes
         self._cache.set(key, cached)

+    cdef int _match_rule(self, unicode string):
+        """Check whether the given string matches any of the exception patterns.
+
+        string (unicode): The string to check.
+
+        Returns (int): The length of the match if the string matches a pattern, otherwise 0.
+        """
+        if self.rule_match is None:
+            return 0
+        match = self.rule_match(string)
+        return (match.end() - match.start()) if match is not None else 0
+
     def find_infix(self, unicode string):
         """Find internal split points of the string, such as hyphens.

diff --git a/spacy/util.py b/spacy/util.py
index afed4142e..316e431ad 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -108,6 +108,11 @@ def compile_infix_regex(entries):
     return re.compile(expression)


+def compile_rule_regex(entries):
+    expression = '(?:' + '|'.join([piece for piece in entries if piece.strip()]) + ')$'
+    return re.compile(expression)
+
+
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError("Stepped slices not supported in Span objects."
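
For reviewers, here is how the pieces above fit together, shown as a small standalone Python sketch rather than the real Cython path. It copies the URL pattern from spacy/language_data/special_cases.py and mirrors compile_rule_regex and Tokenizer._match_rule, but the helper name match_rule, the sample spans and the print loop are illustrative only, and nothing below imports spaCy: the exception patterns are joined into one end-anchored regex, its .match method is handed to the tokenizer as rule_match, and a whitespace-delimited span that matches it in full is pushed back as a single token instead of going through the prefix/suffix/infix splitting.

# Minimal sketch of the exception-pattern mechanism (standalone, no spaCy import).
import re

# The single URL pattern from spacy/language_data/special_cases.py.
EXCEPTION_PATTERNS = [
    r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)',
]


def compile_rule_regex(entries):
    # Group the alternatives so the trailing '$' anchors the whole expression,
    # not just the last alternative (mirrors spacy.util.compile_rule_regex above).
    expression = '(?:' + '|'.join(piece for piece in entries if piece.strip()) + ')$'
    return re.compile(expression)


rule_match = compile_rule_regex(EXCEPTION_PATTERNS).match


def match_rule(string):
    # Mirrors Tokenizer._match_rule: length of the match, or 0 if nothing matches.
    match = rule_match(string)
    return (match.end() - match.start()) if match is not None else 0


for span in [u"www.red-stars.com", u"mailto:foo.bar@baz.com", u"hello,"]:
    if match_rule(span):
        print("kept whole:  " + span)   # would be pushed back as a single token
    else:
        print("affix split: " + span)   # would go through prefix/suffix/infix rules

Because the expression is wrapped in a non-capturing group and anchored with '$', and re.match itself anchors at the start of the string, only spans that match a pattern end to end bypass the affix handling; anything else, such as ordinary prose with trailing punctuation, still takes the normal splitting path.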