diff --git a/spacy/language.py b/spacy/language.py index 16bffcd7b..bebdeab20 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,8 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions - if cls.exception_patterns: - rule_match = util.compile_rule_regex(cls.exception_patterns).match + if cls.token_match: + token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -82,9 +82,9 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, rule_match=rule_match, + return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None): @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline - exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + token_match = language_data.TOKEN_MATCH prefixes = tuple(language_data.TOKENIZER_PREFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index aa379d86d..028924796 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,4 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * -from .special_cases import * +from .tokenizer_exceptions import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py deleted file mode 100644 index e7b2be5a5..000000000 --- a/spacy/language_data/special_cases.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import unicode_literals - -EXCEPTION_PATTERNS = r''' 
-((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) -'''.strip().split() diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py new file mode 100644 index 000000000..6551440f2 --- /dev/null +++ b/spacy/language_data/tokenizer_exceptions.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import re + +_URL_PATTERN = r''' +^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ +'''.strip() + +TOKEN_MATCH = re.compile(_URL_PATTERN).match + +__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 24c76f7ee..1a3e86b49 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,7 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef public object rule_match + cdef public object token_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -25,7 +25,6 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4aabdb3db..63ac84482 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, rule_match = None): + infix_finditer=None, token_match = None): '''Load a Tokenizer, 
reading unsupplied components from the path. Arguments: @@ -39,8 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - Special case matcher. Signature of re.compile(string).match + token_match: + A boolean function matching strings that become tokens. prefix_search: Signature of re.compile(string).search suffix_search: @@ -67,9 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -86,14 +86,13 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. + token_match: + A boolean function matching strings that become tokens. 
''' self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() - self.rule_match = rule_match + self.token_match = token_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -105,10 +104,10 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self.rule_match, self._prefix_re, self._suffix_re, - self._infix_re) + self._infix_re, + self.token_match) return (self.__class__, args, None, None) @@ -208,7 +207,7 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self._match_rule(span): + if self.token_match and self.token_match(span): tokens.push_back(self.vocab.get(tokens.mem, span), False) else: span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) @@ -323,18 +322,6 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - cdef int _match_rule(self, unicode string): - """Check whether the given string matches any of the patterns. - - string (unicode): The string to segment. - - Returns (int or None): The length of the prefix if present, otherwise None. - """ - if self.rule_match is None: - return 0 - match = self.rule_match(string) - return (match.end() - match.start()) if match is not None else 0 - def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index 316e431ad..afed4142e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,11 +108,6 @@ def compile_infix_regex(entries): return re.compile(expression) -def compile_rule_regex(entries): - expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' - return re.compile(expression) - - def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects."