mirror of https://github.com/explosion/spaCy.git
* Add offsets to Tokens class. Some changes to interfaces, and reorganization of spacy.Lang
This commit is contained in:
parent 2805068ca8
commit 6fb42c4919
@@ -1,20 +1,21 @@
-from libc.stdint cimport uint32_t
-from libc.stdint cimport uint64_t
-from spacy.word cimport Lexeme
-from spacy.tokens cimport Tokens
-from spacy.lexeme cimport LexemeC
-from preshed.maps cimport PreshMap
-
-from cymem.cymem cimport Pool
-
+from libcpp.utility cimport pair
+from libcpp.vector cimport vector
+from libc.stdint cimport uint64_t, int64_t
+
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
+
+from .word cimport Lexeme
+from .tokens cimport Tokens
+from .lexeme cimport LexemeC


 cdef extern from "Python.h":
     cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
     cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)


 cdef struct String:

@@ -24,7 +25,7 @@ cdef struct String:


 cdef class Lexicon:
-    cdef Pool _mem
+    cdef Pool mem
     cpdef readonly size_t size

     cdef vector[LexemeC*] lexemes

@@ -37,7 +38,6 @@ cdef class Lexicon:
     cdef list _string_features
     cdef list _flag_features


 cdef class Language:
     cdef Pool _mem
     cdef unicode name

@@ -47,19 +47,17 @@ cdef class Language:

     cdef object prefix_re
     cdef object suffix_re
+    cdef object infix_re

     cpdef Tokens tokenize(self, unicode text)
-    cpdef Lexeme lookup(self, unicode text)

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
-
-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
-
-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes) except -1
-
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL
+    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
+    cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1
spacy/lang.pyx (153 changed lines)
@@ -14,9 +14,9 @@ from os import path
 import re

 from .util import read_lang_data
-from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
-from spacy.lexeme cimport LexStr_orig
+from .tokens import Tokens
+from .lexeme cimport LexemeC, get_lexeme_dict, lexeme_pack, lexeme_unpack
+from .lexeme cimport LexStr_orig
 from murmurhash.mrmr cimport hash64

 from cpython.ref cimport Py_INCREF

@@ -41,23 +41,13 @@ cdef class Language:
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        rules, prefix, suffix, lexemes = util.read_lang_data(name)
+        rules, prefix, suffix, infix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
+        self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
         self._load_special_tokenization(rules)

-    cpdef Lexeme lookup(self, unicode string):
-        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
-
-        Args:
-            string (unicode): The string to be looked up. Must be unicode, not bytes.
-
-        Returns:
-            lexeme (Lexeme): A reference to a lexical type.
-        """
-        return self.lexicon.lookup(string)

     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.

@@ -73,37 +63,43 @@
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
-        cdef size_t length = len(string)
+        cdef int length = len(string)
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens

-        cdef size_t start = 0
-        cdef size_t i = 0
+        cdef int start = 0
+        cdef int i = 0
         cdef Py_UNICODE* chars = string
         cdef String span
         for i in range(length):
             if Py_UNICODE_ISSPACE(chars[i]) == 1:
                 if start < i:
-                    string_from_slice(&span, chars, start, i)
-                    if not _extend_from_map(tokens.v, &span, self.cache):
-                        self._tokenize(tokens.v, &span)
+                    self._tokenize(tokens, chars, start, i)
                 start = i + 1
         i += 1
         if start < i:
-            string_from_slice(&span, chars, start, i)
-            if not _extend_from_map(tokens.v, &span, self.cache):
-                self._tokenize(tokens.v, &span)
+            self._tokenize(tokens, chars, start, i)
         return tokens

-    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        cdef size_t i
-        cdef uint64_t orig_key = string.key
-        cdef size_t orig_size = tokens_v.size()

+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
+        cdef String span
+        cdef vector[LexemeC*] prefixes
+        cdef vector[LexemeC*] suffixes
+        cdef uint64_t orig_key
+        cdef int orig_size
+        string_slice(&span, chars, start, end)
+        lexemes = <LexemeC**>self.cache.get(span.key)
+        if lexemes != NULL:
+            tokens.extend(start, lexemes, 0)
+        else:
+            orig_key = span.key
+            orig_size = tokens.lex.size()
+            span = self._split_affixes(&span, &prefixes, &suffixes)[0]
+            self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
+            self._save_cached(&tokens.lex, orig_key, orig_size)

+    cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes) except NULL:
         cdef size_t i
         cdef String prefix
         cdef String suffix
         cdef String minus_pre
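Illustrative note (not part of the diff): the reworked tokenize/_tokenize pair keeps the same overall flow — split on whitespace, look the span up in the cache keyed by its hash, and only fall back to affix splitting on a miss, caching the result afterwards. A rough Python model of that flow, with hypothetical helper names:

    def tokenize_model(string, cache, split_chunk):
        # split_chunk stands in for the _split_affixes/_attach_tokens work
        tokens = []
        for chunk in string.split():
            pieces = cache.get(chunk)
            if pieces is None:
                pieces = split_chunk(chunk)   # prefix/suffix/infix handling
                cache[chunk] = pieces         # mirrors _save_cached
            tokens.extend(pieces)
        return tokens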
@@ -113,8 +109,8 @@
             last_size = string.n
             pre_len = self._find_prefix(string.chars, string.n)
             if pre_len != 0:
-                string_from_slice(&prefix, string.chars, 0, pre_len)
-                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                string_slice(&prefix, string.chars, 0, pre_len)
+                string_slice(&minus_pre, string.chars, pre_len, string.n)
                 # Check whether we've hit a special-case
                 if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
                     string = &minus_pre

@@ -122,16 +118,15 @@
                     break
             suf_len = self._find_suffix(string.chars, string.n)
             if suf_len != 0:
-                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
-                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                string_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
                 # Check whether we've hit a special-case
                 if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
                     string = &minus_suf
                     suffixes.push_back(self.lexicon.get(&suffix))
                     break

             if pre_len and suf_len and (pre_len + suf_len) <= string.n:
-                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                string_slice(string, string.chars, pre_len, string.n - suf_len)
                 prefixes.push_back(self.lexicon.get(&prefix))
                 suffixes.push_back(self.lexicon.get(&suffix))
             elif pre_len:
@@ -140,26 +135,37 @@
             elif suf_len:
                 string = &minus_suf
                 suffixes.push_back(self.lexicon.get(&suffix))

             if self.specials.get(string.key):
                 break
+        return string

-        self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
-        self._save_cached(tokens_v, orig_key, orig_size)

-    cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
+    cdef int _attach_tokens(self, Tokens tokens,
+                            int idx, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
-        cdef size_t i
+        cdef int split
         cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in deref(prefixes):
-            tokens.push_back(lexeme)
-        if not _extend_from_map(tokens, string, self.specials):
-            self._split_body_token(tokens, string)
+        cdef String span
+        idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+        if string.n != 0:
+            lexemes = <LexemeC**>self.cache.get(string.key)
+            if lexemes != NULL:
+                idx = tokens.extend(idx, lexemes, 0)
+            else:
+                split = self._find_infix(string.chars, string.n)
+                if split == 0 or split == -1:
+                    idx = tokens.push_back(idx, self.lexicon.get(string))
+                else:
+                    string_slice(&span, string.chars, 0, split)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split, split+1)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
+                    string_slice(&span, string.chars, split + 1, string.n)
+                    idx = tokens.push_back(idx, self.lexicon.get(&span))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            tokens.push_back(deref(it))
+            idx = tokens.push_back(idx, deref(it))
             preinc(it)

     cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
@@ -171,15 +177,17 @@
         lexemes[i + 1] = NULL
         self.cache.set(key, lexemes)

-    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
-        tokens.push_back(self.lexicon.get(string))
+    cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef unicode string = chars[:length]
+        match = self.infix_re.search(string)
+        return match.start() if match is not None else 0

     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.prefix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
         match = self.suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
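Illustrative note (not part of the diff): when _find_infix reports a match inside the remaining chunk, _attach_tokens now emits three tokens — the text before the infix, the single infix character, and the text after it. A rough Python sketch of that split, assuming the infix pattern matches a bare hyphen (the real pattern comes from the language's infix data file):

    import re

    def split_on_infix(chunk, infix_re=re.compile(r'-')):   # assumed pattern
        match = infix_re.search(chunk)
        if match is None or match.start() == 0:
            return [chunk]
        i = match.start()
        return [chunk[:i], chunk[i:i + 1], chunk[i + 1:]]

    # '6,000-year' -> ['6,000', '-', 'year'], which is why test_cnts7 further
    # down now expects 10 tokens instead of 8.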
@@ -212,27 +220,30 @@

 cdef class Lexicon:
     def __cinit__(self, lexemes):
-        self._mem = Pool()
+        self.mem = Pool()
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
         cdef dict lexeme_dict
         cdef LexemeC* lexeme
-        for lexeme_dict in lexemes:
-            string_from_unicode(&string, lexeme_dict['string'])
-            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        for py_string, lexeme_dict in lexemes.iteritems():
+            string_from_unicode(&string, py_string)
+            lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
             lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.lexemes.push_back(lexeme)
             self.size += 1

     def __getitem__(self, size_t i):
         return Lexeme(<size_t>self.lexemes.at(i))

     cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lex
         lex = <LexemeC*>self._dict.get(string.key)
         if lex != NULL:
             return lex

-        lex = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         cdef unicode unicode_string = string.chars[:string.n]
         lexeme_unpack(lex, get_lexeme_dict(self.size, unicode_string))
         self._dict.set(string.key, lex)
@@ -255,38 +266,12 @@
         return Lexeme(<size_t>lexeme)


-cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
-    if string.n == 0:
-        return 1
-    lexemes = <LexemeC**>map_.get(string.key)
-    if lexemes == NULL:
-        return 0
-    cdef size_t i = 0
-    while lexemes[i] != NULL:
-        tokens.push_back(lexemes[i])
-        i += 1
-    return 1
-
-
 cdef void string_from_unicode(String* s, unicode uni):
     cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
-    string_from_slice(s, c_uni, 0, len(uni))
+    string_slice(s, c_uni, 0, len(uni))


-cdef inline void string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
+cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
     s.chars = &chars[start]
     s.n = end - start
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_prefix(String* s, String* prefix, size_t n) nogil:
-    string_from_slice(prefix, s.chars, 0, n)
-    s.chars += n
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-
-cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
-    string_from_slice(suffix, s.chars, s.n - n, s.n)
-    s.n -= n
-    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
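Illustrative usage after this change, pieced together from the updated tests further down (not an excerpt from the diff): tokenization stays on the Language object, while lexeme lookup now goes through its Lexicon.

    from spacy.en import EN

    tokens = EN.tokenize(u'The year: 1984.')
    assert len(tokens) == 5
    assert tokens.orig(3) == u'1984'        # per-token string access by index
    lex = EN.lexicon.lookup(u'Hello')       # lookup now lives on the Lexicon
    assert lex.string == u'Hello'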
@@ -79,7 +79,7 @@ def canon_case(string, prob, cluster, case_stats, tag_stats):

 def word_shape(string, *args):
     length = len(string)
-    shape = ""
+    shape = []
     last = ""
     shape_char = ""
     seq = 0

@@ -99,8 +99,8 @@ def word_shape(string, *args):
             seq = 0
             last = shape_char
         if seq < 5:
-            shape += shape_char
-    return shape
+            shape.append(shape_char)
+    return ''.join(shape)


 def non_sparse(string, prob, cluster, case_stats, tag_stats):
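Illustrative note (not part of the diff): the word_shape change swaps repeated string concatenation for appending to a list and joining once at the end — the usual linear-time idiom:

    # Before: shape = ""; shape += shape_char inside the loop; return shape
    # After:  collect the pieces and join once
    shape = []
    for shape_char in "Xxxxx":        # stand-in for the per-character loop
        shape.append(shape_char)
    result = ''.join(shape)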
@@ -2,14 +2,10 @@ from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector


-cdef struct Token:
-    int i
-    int pos
-    LexemeC* lex
-
-
 cdef class Tokens:
-    cdef vector[Token] v
+    cdef vector[LexemeC*] lex
+    cdef vector[int] idx
+    cdef vector[int] pos

+    cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
+    cdef int push_back(self, int i, LexemeC* lexeme) except -1

@@ -21,6 +17,7 @@ cdef class Tokens:
     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *
     cpdef unicode string_view(self, size_t i, size_t view_id)

     cpdef unicode string(self, size_t i)
+    cpdef unicode orig(self, size_t i)
     cpdef unicode norm(self, size_t i)
     cpdef unicode shape(self, size_t i)
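Illustrative note (not part of the diff): the Token struct is replaced by three parallel vectors, and push_back now threads a character offset through, returning where the next token starts. A rough Python model of that bookkeeping (the real code reads the length from the lexeme's LexInt_length field; here it is passed explicitly):

    class TokensModel:
        def __init__(self):
            self.lex = []   # lexeme of each token
            self.idx = []   # character offset of each token
            self.pos = []   # tag slot, 0 for now

        def push_back(self, idx, lexeme, length):
            self.lex.append(lexeme)
            self.idx.append(idx)
            self.pos.append(0)
            return idx + length    # offset where the next token starts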
@@ -25,17 +25,20 @@ cdef class Tokens:
     """
     def __cinit__(self, string_length=0):
         size = int(string_length / 3) if string_length >= 3 else 1
-        self.v = vector[Token]()
-        self.v.reserve(size)
+        self.lex.reserve(size)
+        self.idx.reserve(size)
+        self.pos.reserve(size)

     def __getitem__(self, i):
-        return Lexeme(<size_t>self.v.at(i).lex)
+        return Lexeme(<size_t>self.lex.at(i))

     def __len__(self):
-        return self.v.size()
+        return self.lex.size()

     cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
-        self.v.push_back(Token(idx, 0, lexeme))
+        self.lex.push_back(lexeme)
+        self.idx.push_back(idx)
+        self.pos.push_back(0)
         return idx + lexeme.ints[<int>LexInt_length]

     cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@@ -46,120 +49,124 @@
             i = 0
             while lexemes[i] != NULL:
                 idx = self.push_back(idx, lexemes[i])
                 i += 1
         else:
             for i in range(n):
                 idx = self.push_back(idx, lexemes[i])
         return idx

     cpdef int id(self, size_t i) except -1:
-        return self.v.at(i).lex.ints[<int>LexInt_id]
+        return self.lex.at(i).ints[<int>LexInt_id]

     cpdef float prob(self, size_t i) except 1:
-        return self.v.at(i).lex.floats[<int>LexFloat_prob]
+        return self.lex.at(i).floats[<int>LexFloat_prob]

     cpdef int cluster(self, size_t i) except *:
-        return self.v.at(i).lex.ints[<int>LexInt_cluster]
+        return self.lex.at(i).ints[<int>LexInt_cluster]

     cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_orth_flag(self.lex.at(i), flag_id)

     cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, flag_id)
+        return lexeme_check_dist_flag(self.lex.at(i), flag_id)

     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return lexeme_get_string(self.v.at(i).lex, view_id)
+        return lexeme_get_string(self.lex.at(i), view_id)

     # Provide accessor methods for the features supported by the language.
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.

     cpdef unicode string(self, size_t i):
+        return self.orig(i)
+
+    cpdef unicode orig(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_orig]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_orig]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode norm(self, size_t i):
-        cdef bytes utf8_string = self.v.at(i).lex.strings[<int>LexStr_norm]
+        cdef bytes utf8_string = self.lex.at(i).strings[<int>LexStr_norm]
         cdef unicode string = utf8_string.decode('utf8')
         return string

     cpdef unicode shape(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_shape)
+        return lexeme_get_string(self.lex.at(i), LexStr_shape)

     cpdef unicode unsparse(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_unsparse)
+        return lexeme_get_string(self.lex.at(i), LexStr_unsparse)

     cpdef unicode asciied(self, size_t i):
-        return lexeme_get_string(self.v.at(i).lex, LexStr_asciied)
+        return lexeme_get_string(self.lex.at(i), LexStr_asciied)

     cpdef bint is_alpha(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_alpha)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha)

     cpdef bint is_ascii(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_ascii)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii)

     cpdef bint is_digit(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_digit)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit)

     cpdef bint is_lower(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_lower)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower)

     cpdef bint is_punct(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_punct)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct)

     cpdef bint is_space(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_space)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space)

     cpdef bint is_title(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_title)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title)

     cpdef bint is_upper(self, size_t i) except *:
-        return lexeme_check_orth_flag(self.v.at(i).lex, LexOrth_upper)
+        return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper)

     cpdef bint can_adj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj)

     cpdef bint can_adp(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adp)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp)

     cpdef bint can_adv(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_adv)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv)

     cpdef bint can_conj(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_conj)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj)

     cpdef bint can_det(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_det)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_det)

     cpdef bint can_noun(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_noun)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun)

     cpdef bint can_num(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_num)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_num)

     cpdef bint can_pdt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pdt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt)

     cpdef bint can_pos(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pos)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos)

     cpdef bint can_pron(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_pron)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron)

     cpdef bint can_prt(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_prt)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt)

     cpdef bint can_punct(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_punct)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct)

     cpdef bint can_verb(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_verb)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb)

     cpdef bint oft_lower(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_lower)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower)

     cpdef bint oft_title(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_title)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_title)

     cpdef bint oft_upper(self, size_t i) except *:
-        return lexeme_check_dist_flag(self.v.at(i).lex, LexDist_upper)
+        return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper)
@@ -4,3 +4,5 @@ ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint64_t flag_t
+ctypedef uintptr_t id_t
+

@@ -1,7 +1,7 @@
 import os
 from os import path
 import codecs
-import json
+import ujson
 import re

 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -16,28 +16,36 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
+    infix = read_infix(data_dir)

     lex_loc = path.join(data_dir, 'lexemes.json')
     if path.exists(lex_loc):
         with open(lex_loc) as file_:
             lexemes = ujson.load(file_)
     else:
-        lexemes = []
-    return tokenization, prefix, suffix, lexemes
+        lexemes = {}
+    return tokenization, prefix, suffix, infix, lexemes


 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries])
+        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
         return expression

 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-        expression = '|'.join([re.escape(piece) + '$' for piece in entries])
+        expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
         return expression

+def read_infix(data_dir):
+    with utf8open(path.join(data_dir, 'infix')) as file_:
+        entries = file_.read().split('\n')
+        expression = '|'.join([piece for piece in entries if piece.strip()])
+        return expression
+

 def read_tokenization(lang):
     loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
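Illustrative note (not part of the diff): read_prefix anchors each escaped entry at the start of the chunk and ORs them together; _find_prefix then reports the match length. A sketch assuming a prefix file containing ( and " (the data files themselves are not shown here):

    import re

    entries = ['(', '"']
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    prefix_re = re.compile(expression)       # '^\\(|^"'
    match = prefix_re.search('"Hello')
    assert match is not None
    assert match.end() - match.start() == 1  # what _find_prefix would return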
@@ -60,3 +68,16 @@
         seen.add(chunk)
         entries.append((chunk, pieces))
     return entries
+
+
+def align_tokens(ref, indices):
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
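Illustrative usage of the new align_tokens helper with made-up inputs (not from the diff): ref is a coarser token sequence, indices the (start, end) spans of a finer one over the same concatenated text; each coarse token is yielded with the spans that end inside it.

    ref = ['gimme', '!']
    indices = [(0, 3), (3, 5), (5, 6)]       # e.g. spans of 'gim', 'me', '!'
    assert list(align_tokens(ref, indices)) == [
        ('gimme', [(0, 3), (3, 5)]),
        ('!', [(5, 6)]),
    ]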
@@ -7,20 +7,20 @@ from spacy.lexeme import *


 def test_is_alpha():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert the.check_orth_flag(LexOrth_alpha)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert not year.check_orth_flag(LexOrth_alpha)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_alpha)


 def test_is_digit():
-    the = EN.lookup('the')
+    the = EN.lexicon.lookup('the')
     assert not the.check_orth_flag(LexOrth_digit)
-    year = EN.lookup('1999')
+    year = EN.lexicon.lookup('1999')
     assert year.check_orth_flag(LexOrth_digit)
-    mixed = EN.lookup('hello1')
+    mixed = EN.lexicon.lookup('hello1')
     assert not mixed.check_orth_flag(LexOrth_digit)
@@ -9,7 +9,7 @@ from spacy.lexeme import *

 @pytest.fixture
 def C3P0():
-    return EN.lookup("C3P0")
+    return EN.lexicon.lookup("C3P0")


 def test_shape(C3P0):

@@ -17,11 +17,11 @@ def test_shape(C3P0):


 def test_length():
-    t = EN.lookup('the')
+    t = EN.lexicon.lookup('the')
     assert t.length == 3
-    t = EN.lookup("n't")
+    t = EN.lexicon.lookup("n't")
     assert t.length == 3
-    t = EN.lookup("'s")
+    t = EN.lexicon.lookup("'s")
     assert t.length == 2
-    t = EN.lookup('Xxxx')
+    t = EN.lexicon.lookup('Xxxx')
     assert t.length == 4
@@ -27,7 +27,7 @@ def test_punct():

 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984"
+    assert lex_ids.orig(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string

@@ -101,4 +101,4 @@
 def test_cnts7():
     text = 'But then the 6,000-year ice age came...'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 8
+    assert len(tokens) == 10
@@ -4,31 +4,31 @@ from spacy.en import EN


 def test_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('bye').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('bye').string != addr.string


 def test_eq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello').string == addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello').string == addr.string


 def test_round_trip():
-    hello = EN.lookup('Hello')
+    hello = EN.lexicon.lookup('Hello')
     assert hello.string == 'Hello'


 def test_case_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('hello').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('hello').string != addr.string


 def test_punct_neq():
-    addr = EN.lookup('Hello')
-    assert EN.lookup('Hello,').string != addr.string
+    addr = EN.lexicon.lookup('Hello')
+    assert EN.lexicon.lookup('Hello,').string != addr.string


 def test_short():
-    addr = EN.lookup('I')
+    addr = EN.lexicon.lookup('I')
     assert addr.string == 'I'
     assert addr.string != 'not'