diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index f84e35fcd..a3cb7c281 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -1,20 +1,21 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
-from spacy.word cimport Lexeme
-from spacy.tokens cimport Tokens
-from spacy.lexeme cimport LexemeC
-from preshed.maps cimport PreshMap
-
-from cymem.cymem cimport Pool
-
-from libcpp.utility cimport pair
 from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+
+from .word cimport Lexeme
+from .tokens cimport Tokens
+from .lexeme cimport LexemeC
+
 
 
 cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
 
 
 cdef struct String:
@@ -24,7 +25,7 @@ cdef struct String:
 
 
 cdef class Lexicon:
-    cdef Pool _mem
+    cdef Pool mem
     cpdef readonly size_t size
 
     cdef vector[LexemeC*] lexemes
@@ -37,7 +38,6 @@ cdef class Lexicon:
     cdef list _string_features
     cdef list _flag_features
 
-
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
@@ -47,19 +47,17 @@ cdef class Language:
 
     cdef object prefix_re
     cdef object suffix_re
+    cdef object infix_re
 
     cpdef Tokens tokenize(self, unicode text)
-    cpdef Lexeme lookup(self, unicode text)
-
-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
 
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
+    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
 
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index b3c558dfa..2ab744402 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -14,9 +14,9 @@ from os import path
 import re
 
 from .util import read_lang_data
-from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from spacy.lexeme cimport LexStr_orig
+from .tokens import Tokens
+from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from .lexeme cimport LexStr_orig
 
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -41,23 +41,13 @@ cdef class Language:
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
+        self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)
 
-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)
-
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
 
@@ -73,37 +63,43 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
-        cdef size_t length = len(string)
+        cdef int length = len(string)
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-
-        cdef size_t start = 0
-        cdef size_t i = 0
+        cdef int start = 0
+        cdef int i = 0
         cdef Py_UNICODE* chars = string
-        cdef String span
 
         for i in range(length):
            if Py_UNICODE_ISSPACE(chars[i]) == 1:
                if start < i:
-                    string_from_slice(&span, chars, start, i)
-                    if not _extend_from_map(tokens.v, &span, self.cache):
-                        self._tokenize(tokens.v, &span)
+                    self._tokenize(tokens, chars, start, i)
                start = i + 1
            i += 1
        if start < i:
-            string_from_slice(&span, chars, start, i)
-            if not _extend_from_map(tokens.v, &span, self.cache):
-                self._tokenize(tokens.v, &span)
+            self._tokenize(tokens, chars, start, i)
        return tokens
 
-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef size_t i
-        cdef uint64_t orig_key = string.key
-        cdef size_t orig_size = tokens_v.size()
-
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
+        cdef String span
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
+        cdef uint64_t orig_key
+        cdef int orig_size
+        string_slice(&span, chars, start, end)
+        lexemes = self.cache.get(span.key)
+        if lexemes != NULL:
+            tokens.extend(start, lexemes, 0)
+        else:
+            orig_key = span.key
+            orig_size = tokens.lex.size()
+            span = self._split_affixes(&span, &prefixes, &suffixes)[0]
+            self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
+            self._save_cached(&tokens.lex, orig_key, orig_size)
 
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL:
+        cdef size_t i
         cdef String prefix
         cdef String suffix
         cdef String minus_pre
@@ -113,8 +109,8 @@ cdef class Language:
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_from_slice(&prefix, string.chars, 0, pre_len)
-                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                string_slice(&prefix, string.chars, 0, pre_len)
+                string_slice(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                     string = &minus_pre
@@ -122,16 +118,15 @@ cdef class Language:
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                     string = &minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break
-
             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                string_slice(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -140,26 +135,37 @@ cdef class Language:
             elif suf_len:
                 string = &minus_suf
                 suffixes.push_back(self.lexicon.get(&suffix))
-            if self.specials.get(string.key):
                 break
+        return string
 
-        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
-        self._save_cached(tokens_v, orig_key, orig_size)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+    cdef int _attach_tokens(self, Tokens tokens,
+                            int idx, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
-        cdef size_t i
+        cdef int split
         cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in deref(prefixes):
-            tokens.push_back(lexeme)
-        if not _extend_from_map(tokens, string, self.specials):
-            self._split_body_token(tokens, string)
+        cdef String span
+        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if string.n != 0:
+            lexemes = self.cache.get(string.key)
+            if lexemes != NULL:
+                idx = tokens.extend(idx, lexemes, 0)
+            else:
+                split = self._find_infix(string.chars, string.n)
+                if split == 0 or split == -1:
+                    idx = tokens.push_back(idx, self.lexicon.get(string))
+                else:
+                    string_slice(&span, string.chars, 0, split)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split, split+1)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split + 1, string.n)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
 
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            tokens.push_back(deref(it))
+            idx = tokens.push_back(idx, deref(it))
             preinc(it)
 
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@ cdef class Language:
             lexemes[i + 1] = NULL
             self.cache.set(key, lexemes)
 
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
-        tokens.push_back(self.lexicon.get(string))
+    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef unicode string = chars[:length]
+        match = self.infix_re.search(string)
+        return match.start() if match is not None else 0
 
     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
@@ -212,27 +220,30 @@ cdef class Language:
 
 
 cdef class Lexicon:
     def __cinit__(self, lexemes):
-        self._mem = Pool()
+        self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
         cdef dict lexeme_dict
         cdef LexemeC* lexeme
-        for lexeme_dict in lexemes:
-            string_from_unicode(&string, lexeme_dict['string'])
-            lexeme = self._mem.alloc(1, sizeof(LexemeC))
+        for py_string, lexeme_dict in lexemes.iteritems():
+            string_from_unicode(&string, py_string)
+            lexeme = self.mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.lexemes.push_back(lexeme)
             self.size += 1
 
+    def __getitem__(self, size_t i):
+        return Lexeme(self.lexemes.at(i))
+
     cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lex
         lex = self._dict.get(string.key)
         if lex != NULL:
             return lex
-        lex = self._mem.alloc(1, sizeof(LexemeC))
+        lex = self.mem.alloc(1, sizeof(LexemeC))
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@ cdef class Lexicon:
         return Lexeme(lexeme)
 
 
-cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
-    if string.n == 0:
-        return 1
-    lexemes = map_.get(string.key)
-    if lexemes == NULL:
-        return 0
-    cdef size_t i = 0
-    while lexemes[i] != NULL:
-        tokens.push_back(lexemes[i])
-        i += 1
-    return 1
-
-
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = uni
-    string_from_slice(s, c_uni, 0, len(uni))
+    string_slice(s, c_uni, 0, len(uni))
 
 
-cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
+cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
     s.chars = &chars[start]
     s.n = end - start
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
-    string_from_slice(prefix, s.chars, 0, n)
-    s.chars += n
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
-    string_from_slice(suffix, s.chars, s.n - n, s.n)
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
diff --git a/spacy/orth.py b/spacy/orth.py
index 53dbcf863..4bec8d665 100644
--- a/spacy/orth.py
+++ b/spacy/orth.py
@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):
 
 def word_shape(string, *args):
     length = len(string)
-    shape = ""
+    shape = []
     last = ""
     shape_char = ""
     seq = 0
@@ -99,8 +99,8 @@ def word_shape(string, *args):
             seq = 0
         last = shape_char
         if seq < 5:
-            shape += shape_char
-    return shape
+            shape.append(shape_char)
+    return ''.join(shape)
 
 
 def non_sparse(string, prob, cluster, case_stats, tag_stats):
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 38918e092..97adea956 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector
 
 
-cdef struct Token:
-    int i
-    int pos
-    LexemeC* lex
-
-
 cdef class Tokens:
-    cdef vector[Token] v
+    cdef vector[LexemeC*] lex
+    cdef vector[int] idx
+    cdef vector[int] pos
 
     cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
     cdef int push_back(self, int i, LexemeC* lexeme) except -1
@@ -21,6 +17,7 @@ cdef class Tokens:
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
     cpdef unicode string_view(self, size_t i, size_t view_id)
 
+    cpdef unicode string(self, size_t i)
     cpdef unicode orig(self, size_t i)
     cpdef unicode norm(self, size_t i)
     cpdef unicode shape(self, size_t i)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 44bca0b45..783aa5c18 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -25,17 +25,20 @@ cdef class Tokens:
     """
     def __cinit__(self, string_length=0):
         size = int(string_length / 3) if string_length >= 3 else 1
-        self.v = vector[Token]()
-        self.v.reserve(size)
+        self.lex.reserve(size)
+        self.idx.reserve(size)
+        self.pos.reserve(size)
 
     def __getitem__(self, i):
-        return Lexeme(self.v.at(i).lex)
+        return Lexeme(self.lex.at(i))
 
     def __len__(self):
-        return self.v.size()
+        return self.lex.size()
 
     cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
-        self.v.push_back(Token(idx, 0, lexeme))
+        self.lex.push_back(lexeme)
+        self.idx.push_back(idx)
+        self.pos.push_back(0)
         return idx + lexeme.ints[LexInt_length]
 
     cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@ cdef class Tokens:
             i = 0
             while lexemes[i] != NULL:
                 idx = self.push_back(idx, lexemes[i])
+                i += 1
         else:
             for i in range(n):
                 idx = self.push_back(idx, lexemes[i])
         return idx
 
     cpdef int id(self, size_t i) except -1:
-        return self.v.at(i).lex.ints[LexInt_id]
+        return self.lex.at(i).ints[LexInt_id]
 
     cpdef float prob(self, size_t i) except 1:
-        return self.v.at(i).lex.floats[LexFloat_prob]
+        return self.lex.at(i).floats[LexFloat_prob]
 
     cpdef int cluster(self, size_t i) except *:
-        return self.v.at(i).lex.ints[LexInt_cluster]
+        return self.lex.at(i).ints[LexInt_cluster]
 
     cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_orth_flag(self.lex.at(i), flag_id)
 
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_dist_flag(self.lex.at(i), flag_id)
 
     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return lexeme_get_string(self.v.at(i).lex, view_id)
+        return lexeme_get_string(self.lex.at(i), view_id)
 
     # Provide accessor methods for the features supported by the language.
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.
+    cpdef unicode string(self, size_t i):
+        return self.orig(i)
+
     cpdef unicode orig(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[LexStr_orig]
+        cdef bytes utf8_string = self.lex.at(i).strings[LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string
 
     cpdef unicode norm(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[LexStr_norm]
+        cdef bytes utf8_string = self.lex.at(i).strings[LexStr_norm]
         cdef unicode string = utf8_string.decode('utf8')
         return string
 
     cpdef unicode shape(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
+        return lexeme_get_string(self.lex.at(i), LexStr_shape)
 
     cpdef unicode unsparse(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
+        return lexeme_get_string(self.lex.at(i), LexStr_unsparse)
 
     cpdef unicode asciied(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
+        return lexeme_get_string(self.lex.at(i), LexStr_asciied)
 
     cpdef bint is_alpha(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)
 
     cpdef bint is_ascii(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)
 
     cpdef bint is_digit(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)
 
     cpdef bint is_lower(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)
 
     cpdef bint is_punct(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)
 
     cpdef bint is_space(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)
 
     cpdef bint is_title(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)
 
     cpdef bint is_upper(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)
 
     cpdef bint can_adj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)
 
     cpdef bint can_adp(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)
 
     cpdef bint can_adv(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)
 
     cpdef bint can_conj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)
 
     cpdef bint can_det(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)
 
     cpdef bint can_noun(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)
 
     cpdef bint can_num(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)
 
     cpdef bint can_pdt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)
 
     cpdef bint can_pos(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)
 
     cpdef bint can_pron(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)
 
     cpdef bint can_prt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)
 
     cpdef bint can_punct(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)
 
     cpdef bint can_verb(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)
 
     cpdef bint oft_lower(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)
 
     cpdef bint oft_title(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)
 
     cpdef bint oft_upper(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 89d6a840d..34c327069 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
 ctypedef uintptr_t id_t
+
+
diff --git a/spacy/util.py b/spacy/util.py
index 15c03780a..fc398bd79 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import json
+import ujson
 import re
 
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
+    infix = read_infix(data_dir)
     lex_loc = path.join(data_dir, 'lexemes.json')
     if path.exists(lex_loc):
         with open(lex_loc) as file_:
             lexemes = ujson.load(file_)
     else:
-        lexemes = []
-    return tokenization, prefix, suffix, lexemes
+        lexemes = {}
+    return tokenization, prefix, suffix, infix, lexemes
 
 
 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
 
 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
     return expression
 
 
+def read_infix(data_dir):
+    with utf8open(path.join(data_dir, 'infix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([piece for piece in entries if piece.strip()])
+    return expression
+
+
 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
@@ -60,3 +68,16 @@ def read_tokenization(lang):
             seen.add(chunk)
             entries.append((chunk, pieces))
     return entries
+
+
+def align_tokens(ref, indices):
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index c6ff44757..f6c77dc43 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -7,20 +7,20 @@ from spacy.lexeme import *
 
 
 def test_is_alpha():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert the.check_orth_flag(LexOrth_alpha)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert not year.check_orth_flag(LexOrth_alpha)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_alpha)
 
 
 def test_is_digit():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert not the.check_orth_flag(LexOrth_digit)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert year.check_orth_flag(LexOrth_digit)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_digit)
diff --git a/tests/test_orth.py b/tests/test_orth.py
index a6be98b05..fb6f56b94 100644
--- a/tests/test_orth.py
+++ b/tests/test_orth.py
@@ -9,7 +9,7 @@ from spacy.lexeme import *
 
 @pytest.fixture
 def C3P0():
-    return EN.lookup("C3P0")
+    return EN.lexicon.lookup("C3P0")
 
 
 def test_shape(C3P0):
@@ -17,11 +17,11 @@ def test_shape(C3P0):
 
 
 def test_length():
-    t = EN.lookup('the')
+    t = EN.lexicon.lookup('the')
     assert t.length == 3
-    t = EN.lookup("n't")
+    t = EN.lexicon.lookup("n't")
     assert t.length == 3
-    t = EN.lookup("'s")
+    t = EN.lexicon.lookup("'s")
     assert t.length == 2
-    t = EN.lookup('Xxxx')
+    t = EN.lexicon.lookup('Xxxx')
     assert t.length == 4
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index fcdcdc141..59a7fe524 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -27,7 +27,7 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984"
+    assert lex_ids.orig(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -101,4 +101,4 @@ def test_cnts6():
 def test_cnts7():
     text = 'But then the 6,000-year ice age came...'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 8
+    assert len(tokens) == 10
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index cd7bf42e1..047df07b3 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -4,31 +4,31 @@ from spacy.en import EN
 
 
 def test_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('bye').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('bye').string != addr.string
 
 
 def test_eq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello').string == addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello').string == addr.string
 
 
 def test_round_trip():
-    hello = EN.lookup('Hello')
+    hello = EN.lexicon.lookup('Hello')
     assert hello.string == 'Hello'
 
 
 def test_case_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('hello').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('hello').string != addr.string
 
 
 def test_punct_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello,').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello,').string != addr.string
 
 
 def test_short():
-    addr = EN.lookup('I')
+    addr = EN.lexicon.lookup('I')
     assert addr.string == 'I'
     assert addr.string != 'not'
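The behavioural core of this patch is the infix handling added to `_attach_tokens`: once prefixes and suffixes have been stripped by `_split_affixes`, the remaining span is searched with `infix_re`, and a match at a non-initial position splits it into three tokens (the part before the infix, the infix character itself, and the part after). The following is a rough pure-Python sketch of that flow, not the Cython implementation; `split_token` and the three patterns are invented stand-ins for the regexes built from the per-language data files.

```python
import re

# Illustrative stand-ins for the patterns that read_lang_data() builds from the
# 'prefix', 'suffix' and 'infix' data files (these are NOT the real patterns).
prefix_re = re.compile(r'^[("\']')
suffix_re = re.compile(r'[)"\'.,;]$')
infix_re = re.compile(r'-')


def split_token(string):
    # Rough pure-Python mimic of _split_affixes() followed by _attach_tokens().
    prefixes, suffixes = [], []
    while string:
        pre = prefix_re.search(string)
        if pre:
            prefixes.append(string[:pre.end()])
            string = string[pre.end():]
            continue
        suf = suffix_re.search(string)
        if suf:
            suffixes.insert(0, string[suf.start():])
            string = string[:suf.start()]
            continue
        break
    if not string:
        body = []
    else:
        infix = infix_re.search(string)
        if infix is None or infix.start() == 0:
            body = [string]
        else:
            i = infix.start()
            # Split into three tokens: before the infix, the infix itself, after it.
            body = [string[:i], string[i:i + 1], string[i + 1:]]
    return prefixes + body + suffixes


print(split_token('("6,000-year")'))
# ['(', '"', '6,000', '-', 'year', '"', ')']
```

This is consistent with the updated expectation in `test_cnts7`: '6,000-year' now comes out as '6,000', '-', 'year', taking the sentence from 8 to 10 tokens.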
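On the `util.py` side, `read_prefix` and `read_suffix` now skip blank entries when building their alternation, and the new `read_infix` joins raw (unescaped) patterns the same way. A minimal illustration of the prefix case, using a hypothetical entry list rather than the real data file:

```python
import re

# Hypothetical contents of a 'prefix' data file, one entry per line; the final
# blank entry is exactly what the new `if piece.strip()` guard filters out.
entries = ['(', '"', "'", '']
prefix_re = re.compile('|'.join('^' + re.escape(p) for p in entries if p.strip()))

match = prefix_re.search('"Hello')
print(match.end() - match.start() if match is not None else 0)   # 1 -> strip one character
```

`_find_prefix` and `_find_suffix` turn the match into a length to strip, while the new `_find_infix` returns `match.start()`, since it needs the split position rather than an affix length.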
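`align_tokens` is a small new helper in `spacy/util.py` that pairs each reference token with the (start, end) spans that fall inside it. It advances by `len(token)` only, so the spans appear to be expected in the coordinates of the concatenated reference tokens; the example below follows that assumption and uses invented data.

```python
from spacy.util import align_tokens

# Spans from a finer-grained segmentation, expressed as (start, end) offsets
# over the concatenation of the reference tokens ('NewYork').
ref = ['New', 'York']
indices = [(0, 2), (2, 3), (3, 5), (5, 7)]   # 'Ne', 'w', 'Yo', 'rk'

for token, spans in align_tokens(ref, indices):
    print(token, spans)
# New [(0, 2), (2, 3)]
# York [(3, 5), (5, 7)]
```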