From aa876884f03f404b49273fbe15c8b5a5cbb142f1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:17:57 +0100 Subject: [PATCH] Revert "Revert "Merge remote-tracking branch 'origin/master'"" This reverts commit fb9d3bb022e89f2cd63f2dd61efcac2eeb65cff9. --- .github/PULL_REQUEST_TEMPLATE.md | 4 +- spacy/language.py | 6 +- spacy/language_data/__init__.py | 1 + spacy/language_data/tokenizer_exceptions.py | 11 +++ spacy/tests/tokenizer/test_urls.py | 77 +++++++++++++++++++++ spacy/tokenizer.pxd | 1 + spacy/tokenizer.pyx | 29 +++++--- 7 files changed, 117 insertions(+), 12 deletions(-) create mode 100644 spacy/language_data/tokenizer_exceptions.py create mode 100644 spacy/tests/tokenizer/test_urls.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e99d6dadc..a55f98646 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,8 +12,6 @@ -## Screenshots (if appropriate): - ## Types of changes - [ ] Bug fix (non-breaking change fixing an issue) @@ -27,4 +25,4 @@ - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. \ No newline at end of file +- [ ] All new and existing tests passed. diff --git a/spacy/language.py b/spacy/language.py index c6f1376a4..bebdeab20 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,6 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions + if cls.token_match: + token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -82,7 +84,7 @@ class BaseDefaults(object): vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None): @@ -142,6 +144,8 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline + token_match = language_data.TOKEN_MATCH + prefixes = tuple(language_data.TOKENIZER_PREFIXES) suffixes = tuple(language_data.TOKENIZER_SUFFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index 43a4ef0be..2119c071b 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -4,3 +4,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * +from .tokenizer_exceptions import * diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py new file mode 100644 index 000000000..6551440f2 --- /dev/null +++ b/spacy/language_data/tokenizer_exceptions.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import re + +_URL_PATTERN = r''' +^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ +'''.strip() + +TOKEN_MATCH = re.compile(_URL_PATTERN).match + +__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py new file mode 100644 index 000000000..9e0172834 --- /dev/null +++ b/spacy/tests/tokenizer/test_urls.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +import pytest + +URLS = [ + 
u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news&_r=0",
+    u"www.google.com?q=google",
+    u"google.com",
+    u"www.red-stars.com",
+    pytest.mark.xfail(u"red-stars.com"),
+    u"http://foo.com/blah_(wikipedia)#cite-1",
+    u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux",
+    u"mailto:foo.bar@baz.com",
+    u"mailto:foo-bar@baz-co.com"
+]
+
+# Punctuation we want to check is split away before the URL
+PREFIXES = [
+    "(", '"', "...", ">"
+]
+
+# Punctuation we want to check is split away after the URL
+SUFFIXES = [
+    '"', ":", ">"]
+
+@pytest.mark.parametrize("text", URLS)
+def test_simple_url(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert tokens[0].orth_ == text
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_prefixed_url(en_tokenizer, prefix, url):
+    tokens = en_tokenizer(prefix + url)
+    assert tokens[0].text == prefix
+    assert tokens[1].text == url
+    assert len(tokens) == 2
+
+@pytest.mark.parametrize("suffix", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_suffixed_url(en_tokenizer, url, suffix):
+    tokens = en_tokenizer(url + suffix)
+    assert tokens[0].text == url
+    assert tokens[1].text == suffix
+    assert len(tokens) == 2
+
+@pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("suffix", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_surround_url(en_tokenizer, prefix, suffix, url):
+    tokens = en_tokenizer(prefix + url + suffix)
+    assert tokens[0].text == prefix
+    assert tokens[1].text == url
+    assert tokens[2].text == suffix
+    assert len(tokens) == 3
+
+@pytest.mark.parametrize("prefix1", PREFIXES)
+@pytest.mark.parametrize("prefix2", PREFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_two_prefix_url(en_tokenizer, prefix1, prefix2, url):
+    tokens = en_tokenizer(prefix1 + prefix2 + url)
+    assert tokens[0].text == prefix1
+    assert tokens[1].text == prefix2
+    assert tokens[2].text == url
+    assert len(tokens) == 3
+
+@pytest.mark.parametrize("suffix1", SUFFIXES)
+@pytest.mark.parametrize("suffix2", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_two_suffix_url(en_tokenizer, suffix1, suffix2, url):
+    tokens = en_tokenizer(url + suffix1 + suffix2)
+    assert tokens[0].text == url
+    assert tokens[1].text == suffix1
+    assert tokens[2].text == suffix2
+    assert len(tokens) == 3
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index e53b7dbd1..1a3e86b49 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -16,6 +16,7 @@ cdef class Tokenizer:
     cdef PreshMap _specials
     cpdef readonly Vocab vocab
 
+    cdef public object token_match
     cdef public object prefix_search
     cdef public object suffix_search
     cdef public object infix_finditer
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 66c93528b..0e83c4a75 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -29,7 +29,7 @@ cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment boundaries."""
     @classmethod
     def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
-             infix_finditer=None):
+             infix_finditer=None, token_match=None):
         '''Load a Tokenizer, reading unsupplied components from the path.
 
         Arguments:
             path (Path):
                 The path to load from.
             vocab (Vocab):
                 A storage container for lexical types.
             rules (dict):
                 Exceptions and special-cases for the tokenizer.
+            token_match:
+                A boolean function matching strings that should be treated as single tokens.
             prefix_search:
                 Signature of re.compile(string).search
             suffix_search:
                 Signature of re.compile(string).search
@@ -65,10 +67,9 @@ cdef class Tokenizer:
         with (path / 'tokenizer' / 'infix.txt').open() as file_:
             entries = file_.read().split('\n')
             infix_finditer = util.compile_infix_regex(entries).finditer
-        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer)
+        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
 
-
-    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer):
+    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
         '''Create a Tokenizer, to create Doc objects given unicode text.
 
         Arguments:
@@ -85,10 +86,13 @@ cdef class Tokenizer:
             infix_finditer:
                 A function matching the signature of re.compile(string).finditer
                 to find infixes.
+            token_match:
+                A boolean function matching strings that should be treated as single tokens.
         '''
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
+        self.token_match = token_match
         self.prefix_search = prefix_search
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
@@ -100,9 +104,10 @@ cdef class Tokenizer:
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
-                self._prefix_re,
-                self._suffix_re,
-                self._infix_re)
+                self._prefix_re,
+                self._suffix_re,
+                self._infix_re,
+                self.token_match)
         return (self.__class__, args, None, None)
@@ -216,6 +221,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             last_size = len(string)
             pre_len = self.find_prefix(string)
             if pre_len != 0:
@@ -226,6 +233,8 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
+            if self.token_match and self.token_match(string):
+                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
@@ -263,7 +272,11 @@ cdef class Tokenizer:
                 tokens.push_back(prefixes[0][i], False)
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
-            if not cache_hit:
+            if cache_hit:
+                pass
+            elif self.token_match and self.token_match(string):
+                tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            else:
                 matches = self.find_infix(string)
                 if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
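
The new token_match hook is consulted while affixes are being split off and again before the infix handling shown above, so any remaining string it matches is kept as a single token. Below is a minimal, illustrative sketch of that behaviour, reusing the URL pattern added in spacy/language_data/tokenizer_exceptions.py; the assertions are examples for this note only and are not part of the patch.

from __future__ import unicode_literals
import re

# Same pattern and compiled match function as the TOKEN_MATCH default added above.
_URL_PATTERN = r'''
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
'''.strip()

TOKEN_MATCH = re.compile(_URL_PATTERN).match

# A bare URL matches, so the tokenizer keeps it as one token.
assert TOKEN_MATCH(u"www.google.com?q=google") is not None

# With surrounding punctuation the hook does not match, so the usual
# prefix/suffix rules strip the quotes first; once the remainder matches,
# it is pushed back as a single token.
assert TOKEN_MATCH(u'"www.google.com?q=google"') is None

Passing a different boolean function as the token_match argument to Tokenizer (or setting token_match on a Defaults subclass, as wired up in spacy/language.py above) swaps in a custom rule without touching the prefix, suffix or infix expressions.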