diff --git a/spacy/en.pyx b/spacy/en.pyx
index e3d2ec7a1..29bff9bc1 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -56,67 +56,7 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
-        cdef Py_UNICODE c0 = chars[0]
-        cdef Py_UNICODE c1 = chars[1]
-        if c0 == ",":
-            return 1
-        elif c0 == '"':
-            return 1
-        elif c0 == "(":
-            return 1
-        elif c0 == "[":
-            return 1
-        elif c0 == "{":
-            return 1
-        elif c0 == "*":
-            return 1
-        elif c0 == "<":
-            return 1
-        elif c0 == "$":
-            return 1
-        elif c0 == "£":
-            return 1
-        elif c0 == "€":
-            return 1
-        elif c0 == "\u201c":
-            return 1
-        elif c0 == "'":
-            if c1 == "s":
-                return 2
-            elif c1 == "S":
-                return 2
-            elif c1 == "'":
-                return 2
-            else:
-                return 1
-        elif c0 == "`":
-            if c1 == "`":
-                return 2
-            else:
-                return 1
-        else:
-            return 0
-
-abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
-cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
-    cdef unicode char_i = characters[i]
-    cdef unicode char_i1 = characters[i+1]
-    # Don't count appostrophes as punct if the next char is a letter
-    if characters[i] == "'" and i < (length - 1) and char_i1.isalpha():
-        return i == 0
-    if characters[i] == "-":
-        return False
-        #and i < (length - 1) and characters[i+1] == '-':
-        #return False
-    # Don't count commas as punct if the next char is a number
-    if characters[i] == "," and i < (length - 1) and char_i1.isdigit():
-        return False
-    if characters[i] == "." and i < (length - 1):
-        return False
-    if characters[i] == "." and characters[:i] in abbreviations:
-        return False
-    return not char_i.isalnum()
+    pass
 
 
 EN = English('en', [], [])
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index d154a77d3..57891fcd7 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -25,7 +25,7 @@ cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef Lexeme lookup(self, unicode string)
-    cdef LexemeC* get(self, String* s)
+    cdef LexemeC* get(self, String* s) except NULL
 
     cdef PointerHash _dict
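
A note on the spacy/lang.pxd change above: declaring `cdef LexemeC* get(self, String* s) except NULL` tells Cython that a NULL return from this C-level method signals a pending Python exception (in practice a MemoryError from a failed allocation), so callers get automatic error propagation instead of having to test for NULL lexemes by hand. Roughly, the contract is the one this pure-Python sketch models (the class body and field names here are illustrative, not the real API):

    class Lexicon(object):
        """Model of the `except NULL` contract on Lexicon.get: the call
        either returns a usable entry or raises; callers never receive a
        sentinel they must remember to check."""
        def __init__(self):
            self._dict = {}

        def get(self, string):
            lexeme = self._dict.get(string)
            if lexeme is None:
                # In the Cython version this is where allocation can fail;
                # with `except NULL`, that failure propagates as an
                # exception rather than a silent NULL.
                lexeme = {'string': string, 'length': len(string)}
                self._dict[string] = lexeme
            return lexeme
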
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index b8e5256d6..3328b53bd 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -185,7 +185,11 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens.v, &span)
+                    try:
+                        self._tokenize(tokens.v, &span)
+                    except MemoryError:
+                        print chars[start:i]
+                        raise
                 start = i + 1
             i += 1
         if start < i:
@@ -194,28 +198,61 @@ cdef class Language:
         return tokens
 
     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        self._check_cache(tokens_v, string)
-        if not string.n:
+        cdef size_t i
+        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
+        if lexemes != NULL:
+            i = 0
+            while lexemes[i] != NULL:
+                tokens_v.push_back(lexemes[i])
+                i += 1
             return 0
+        cdef uint64_t orig_key = string.key
         cdef size_t orig_size = tokens_v.size()
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
-        cdef String affix
-        cdef int split = self._find_prefix(string.chars, string.n)
-        while string.n and split >= 1:
-            string_slice_prefix(string, &affix, split)
-            prefixes.push_back(self.lexicon.get(&affix))
-            split = self._find_prefix(string.chars, string.n)
+        cdef String prefix
+        cdef String suffix
+        cdef String minus_pre
+        cdef String minus_suf
+        cdef size_t last_size = 0
+        while string.n != 0 and string.n != last_size:
+            last_size = string.n
+            pre_len = self._find_prefix(string.chars, string.n)
+            if pre_len != 0:
+                string_from_slice(&prefix, string.chars, 0, pre_len)
+                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                # Check whether we've hit a special-case
+                if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
+                    string = &minus_pre
+                    prefixes.push_back(self.lexicon.get(&prefix))
+                    break
+            suf_len = self._find_suffix(string.chars, string.n)
+            if suf_len != 0:
+                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                # Check whether we've hit a special-case
+                if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
+                    string = &minus_suf
+                    suffixes.push_back(self.lexicon.get(&suffix))
+                    break
+
+            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
+                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                prefixes.push_back(self.lexicon.get(&prefix))
+                suffixes.push_back(self.lexicon.get(&suffix))
+            elif pre_len:
+                string = &minus_pre
+                prefixes.push_back(self.lexicon.get(&prefix))
+            elif suf_len:
+                string = &minus_suf
+                suffixes.push_back(self.lexicon.get(&suffix))
+
+            if self.specials.get(string.key):
+                break
 
-        split = self._find_suffix(string.chars, string.n)
-        while string.n and split >= 1:
-            string_slice_suffix(string, &affix, split)
-            suffixes.push_back(self.lexicon.get(&affix))
-            split = self._find_suffix(string.chars, string.n)
-
         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)
@@ -230,16 +267,23 @@ cdef class Language:
         string.key = 0
         string.chars = NULL
 
     cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1:
+        cdef size_t i
+        cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in prefixes[0]:
+        for lexeme in deref(prefixes):
             tokens.push_back(lexeme)
-        self._check_cache(tokens, string)
         if string.n != 0:
-            tokens.push_back(self.lexicon.get(string))
+            lexemes = <LexemeC**>self.specials.get(string.key)
+            if lexemes != NULL:
+                i = 0
+                while lexemes[i] != NULL:
+                    tokens.push_back(lexemes[i])
+                    i += 1
+            else:
+                tokens.push_back(self.lexicon.get(string))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             tokens.push_back(deref(it))
@@ -247,22 +291,100 @@ cdef class Language:
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
-        pass
-
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
-        return 0
-
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
-        if length < 2:
-            return 0
-        cdef unicode string = characters[:length]
-        print repr(string)
-        if string.endswith("'s") or string.endswith("'S"):
-            return 2
-        elif string.endswith("..."):
-            return 3
-        elif not string[-1].isalnum():
+        assert tokens.size() > n
+        cdef LexemeC** lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC*))
+        cdef size_t i, j
+        for i, j in enumerate(range(n, tokens.size())):
+            lexemes[i] = tokens.at(j)
+        lexemes[i + 1] = NULL
+        self.cache.set(key, lexemes)
+
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1] if length >= 2 else 0
+        if c0 == ",":
             return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+        cdef Py_UNICODE c0 = chars[length - 1]
+        cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0
+        cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0
+
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == ')':
+            return 1
+        elif c0 == ']':
+            return 1
+        elif c0 == '}':
+            return 1
+        elif c0 == '*':
+            return 1
+        elif c0 == '!':
+            return 1
+        elif c0 == '?':
+            return 1
+        elif c0 == '%':
+            return 1
+        elif c0 == '$':
+            return 1
+        elif c0 == '>':
+            return 1
+        elif c0 == ':':
+            return 1
+        elif c0 == "'":
+            return 1
+        elif c0 == u'\u201d':
+            return 1
+        elif c0 == "s":
+            if c1 == "'":
+                return 2
+            else:
+                return 0
+        elif c0 == "S":
+            if c1 == "'":
+                return 2
+            else:
+                return 0
+        elif c0 == ".":
+            if c1 == ".":
+                if c2 == ".":
+                    return 3
+                else:
+                    return 2
+            else:
+                return 1
         else:
             return 0
@@ -316,7 +438,7 @@ cdef class Lexicon:
             self._dict.set(string.key, lexeme)
             self.size += 1
 
-    cdef LexemeC* get(self, String* string):
+    cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lexeme
         lexeme = self._dict.get(string.key)
         if lexeme != NULL:
@@ -372,5 +494,3 @@ cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
     string_from_slice(suffix, s.chars, s.n - n, s.n)
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
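
The `_tokenize` rewrite above is the core of the change: strip prefix and suffix punctuation one affix at a time, but stop as soon as the remainder matches an entry in the special-cases table, so forms like "can't" and "U.S." are never torn apart. Here is a simplified pure-Python sketch of that control flow; the affix tables and special-case entries are illustrative stand-ins, and unlike the Cython version (which checks the specials table after each candidate strip) it checks once per loop iteration:

    ONE_CHAR_PREFIXES = set(u',("[{*<$\u00a3\u20ac\u201c\'`')
    TWO_CHAR_PREFIXES = set([u'``'])
    ONE_CHAR_SUFFIXES = set(u',")]}*!?%$>:\'.\u201d')
    TWO_CHAR_SUFFIXES = set([u"'s", u"'S", u'..'])
    THREE_CHAR_SUFFIXES = set([u'...'])
    SPECIALS = {u"can't": [u'ca', u"n't"], u'U.S.': [u'U.S.']}   # toy entries


    def find_prefix(s):
        # Longest match first, mirroring _find_prefix's hard-coded tests.
        if s[:2] in TWO_CHAR_PREFIXES:
            return 2
        if s[:1] in ONE_CHAR_PREFIXES:
            return 1
        return 0


    def find_suffix(s):
        if s[-3:] in THREE_CHAR_SUFFIXES:
            return 3
        if s[-2:] in TWO_CHAR_SUFFIXES:
            return 2
        if s[-1:] in ONE_CHAR_SUFFIXES:
            return 1
        return 0


    def tokenize_word(s):
        prefixes, suffixes = [], []
        last_size = -1
        # Loop until nothing more can be stripped (size stops shrinking).
        while s and len(s) != last_size:
            last_size = len(s)
            if s in SPECIALS:
                break                   # special-case hit: stop splitting
            n = find_prefix(s)
            if n:
                prefixes.append(s[:n])
                s = s[n:]
                continue
            n = find_suffix(s)
            if n:
                suffixes.append(s[-n:])
                s = s[:-n]
        middle = SPECIALS.get(s, [s] if s else [])
        # Suffixes were collected outside-in, so emit them reversed.
        return prefixes + middle + list(reversed(suffixes))


    assert tokenize_word(u"(can't?)") == [u'(', u'ca', u"n't", u'?', u')']
    assert tokenize_word(u'(U.S.)') == [u'(', u'U.S.', u')']
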
diff --git a/tests/test_is_punct.py b/tests/test_is_punct.py
new file mode 100644
index 000000000..687f5cf31
--- /dev/null
+++ b/tests/test_is_punct.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+
+from spacy.orth import is_punct
+
+
+def test_comma():
+    assert is_punct(',', 0, {}, {}) == True
+
+
+def test_space():
+    assert is_punct(' ', 0, {}, {}) == False
+
+
+def test_letter():
+    assert is_punct('a', 0, {}, {}) == False
diff --git a/tests/test_only_punct.py b/tests/test_only_punct.py
new file mode 100644
index 000000000..acaa2fd78
--- /dev/null
+++ b/tests/test_only_punct.py
@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+import pytest
+
+from spacy.en import EN
+
+def test_only_pre1():
+    assert len(EN.tokenize("(")) == 1
+
+
+def test_only_pre2():
+    assert len(EN.tokenize("((")) == 2
+
+def test_only_suf2():
+    assert len(EN.tokenize("''")) == 2
diff --git a/tests/test_special_affix.py b/tests/test_special_affix.py
new file mode 100644
index 000000000..ec1765368
--- /dev/null
+++ b/tests/test_special_affix.py
@@ -0,0 +1,45 @@
+"""Test entries in the tokenization special-case interacting with prefix
+and suffix punctuation."""
+from __future__ import unicode_literals
+import pytest
+
+from spacy.en import EN
+
+def test_no_special():
+    assert len(EN.tokenize("(can)")) == 3
+
+def test_no_punct():
+    assert len(EN.tokenize("can't")) == 2
+
+def test_prefix():
+    assert len(EN.tokenize("(can't")) == 3
+
+
+def test_suffix():
+    assert len(EN.tokenize("can't)")) == 3
+
+
+def test_wrap():
+    assert len(EN.tokenize("(can't)")) == 4
+
+
+def test_uneven_wrap():
+    assert len(EN.tokenize("(can't?)")) == 5
+
+
+def test_prefix_interact():
+    assert len(EN.tokenize("U.S.")) == 1
+    assert len(EN.tokenize("us.")) == 2
+    assert len(EN.tokenize("(U.S.")) == 2
+
+
+def test_suffix_interact():
+    assert len(EN.tokenize("U.S.)")) == 2
+
+
+def test_even_wrap_interact():
+    assert len(EN.tokenize("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact():
+    assert len(EN.tokenize("(U.S.?)")) == 4
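
Taken together, the new tests pin down how prefix and suffix stripping interacts with the special-cases table. From the Python API the behaviour looks like this (a usage sketch assuming a compiled checkout of this revision on the path; the token counts mirror the assertions in tests/test_special_affix.py):

    from spacy.en import EN

    # Wrapping punctuation is split off, and the remainder still hits the
    # special-cases table ("can't" presumably becoming ca + n't).
    assert len(EN.tokenize(u"(can't)")) == 4

    # "U.S." survives as a single token: the special-case check fires
    # before the trailing "." can be stripped as a suffix.
    assert len(EN.tokenize(u'(U.S.?)')) == 4
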