mirror of https://github.com/explosion/spaCy.git
* Rename NORM1 and NORM2 attrs to lower and norm
This commit is contained in:
parent
75feb52c5d
commit
fda94271af
|
@ -20,8 +20,8 @@ def get_lex_props(string):
|
||||||
'flags': get_flags(string),
|
'flags': get_flags(string),
|
||||||
'length': len(string),
|
'length': len(string),
|
||||||
'orth': string,
|
'orth': string,
|
||||||
'norm1': string.lower(),
|
'lower': string.lower(),
|
||||||
'norm2': string,
|
'norm': string,
|
||||||
'shape': orth.word_shape(string),
|
'shape': orth.word_shape(string),
|
||||||
'prefix': string[0],
|
'prefix': string[0],
|
||||||
'suffix': string[-3:],
|
'suffix': string[-3:],
|
||||||
|
|
|
@ -2,13 +2,14 @@ from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
||||||
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
||||||
from ..attrs cimport ORTH as _ORTH
|
from ..attrs cimport ORTH as _ORTH
|
||||||
from ..attrs cimport SHAPE as _SHAPE
|
from ..attrs cimport SHAPE as _SHAPE
|
||||||
from ..attrs cimport NORM1 as _NORM1
|
from ..attrs cimport LOWER as _LOWER
|
||||||
from ..attrs cimport NORM2 as _NORM2
|
from ..attrs cimport NORM as _NORM
|
||||||
from ..attrs cimport CLUSTER as _CLUSTER
|
from ..attrs cimport CLUSTER as _CLUSTER
|
||||||
from ..attrs cimport PREFIX as _PREFIX
|
from ..attrs cimport PREFIX as _PREFIX
|
||||||
from ..attrs cimport SUFFIX as _SUFFIX
|
from ..attrs cimport SUFFIX as _SUFFIX
|
||||||
from ..attrs cimport LEMMA as _LEMMA
|
from ..attrs cimport LEMMA as _LEMMA
|
||||||
from ..attrs cimport POS as _POS
|
from ..attrs cimport POS as _POS
|
||||||
|
from ..attrs cimport TAG as _TAG
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
|
@ -26,10 +27,11 @@ cpdef enum:
|
||||||
|
|
||||||
ORTH = _ORTH
|
ORTH = _ORTH
|
||||||
SHAPE = _SHAPE
|
SHAPE = _SHAPE
|
||||||
LOWER = _NORM1
|
LOWER = _LOWER
|
||||||
NORM2 = _NORM2
|
NORM = _NORM
|
||||||
PREFIX = _PREFIX
|
PREFIX = _PREFIX
|
||||||
SUFFIX = _SUFFIX
|
SUFFIX = _SUFFIX
|
||||||
CLUSTER = _CLUSTER
|
CLUSTER = _CLUSTER
|
||||||
LEMMA = _LEMMA
|
LEMMA = _LEMMA
|
||||||
POS = _POS
|
POS = _POS
|
||||||
|
TAG = _TAG
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .structs cimport LexemeC
|
from .structs cimport LexemeC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
|
@ -21,15 +21,15 @@ cdef class Lexeme:
|
||||||
cdef readonly attr_t length
|
cdef readonly attr_t length
|
||||||
|
|
||||||
cdef readonly attr_t orth
|
cdef readonly attr_t orth
|
||||||
cdef readonly attr_t norm1
|
cdef readonly attr_t lower
|
||||||
cdef readonly attr_t norm2
|
cdef readonly attr_t norm
|
||||||
cdef readonly attr_t shape
|
cdef readonly attr_t shape
|
||||||
cdef readonly attr_t prefix
|
cdef readonly attr_t prefix
|
||||||
cdef readonly attr_t suffix
|
cdef readonly attr_t suffix
|
||||||
|
|
||||||
cdef readonly unicode orth_
|
cdef readonly unicode orth_
|
||||||
cdef readonly unicode norm1_
|
cdef readonly unicode lower_
|
||||||
cdef readonly unicode norm2_
|
cdef readonly unicode norm_
|
||||||
cdef readonly unicode shape_
|
cdef readonly unicode shape_
|
||||||
cdef readonly unicode prefix_
|
cdef readonly unicode prefix_
|
||||||
cdef readonly unicode suffix_
|
cdef readonly unicode suffix_
|
||||||
|
@ -50,15 +50,15 @@ cdef class Lexeme:
|
||||||
py.length = ptr.length
|
py.length = ptr.length
|
||||||
|
|
||||||
py.orth = ptr.orth
|
py.orth = ptr.orth
|
||||||
py.norm1 = ptr.norm1
|
py.lower = ptr.lower
|
||||||
py.norm2 = ptr.norm2
|
py.norm = ptr.norm
|
||||||
py.shape = ptr.shape
|
py.shape = ptr.shape
|
||||||
py.prefix = ptr.prefix
|
py.prefix = ptr.prefix
|
||||||
py.suffix = ptr.suffix
|
py.suffix = ptr.suffix
|
||||||
|
|
||||||
py.orth_ = strings[ptr.orth]
|
py.orth_ = strings[ptr.orth]
|
||||||
py.norm1_ = strings[ptr.norm1]
|
py.lower_ = strings[ptr.lower]
|
||||||
py.norm2_ = strings[ptr.norm2]
|
py.norm_ = strings[ptr.norm]
|
||||||
py.shape_ = strings[ptr.shape]
|
py.shape_ = strings[ptr.shape]
|
||||||
py.prefix_ = strings[ptr.prefix]
|
py.prefix_ = strings[ptr.prefix]
|
||||||
py.suffix_ = strings[ptr.suffix]
|
py.suffix_ = strings[ptr.suffix]
|
||||||
|
@ -80,10 +80,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
return lex.id
|
return lex.id
|
||||||
elif feat_name == ORTH:
|
elif feat_name == ORTH:
|
||||||
return lex.orth
|
return lex.orth
|
||||||
elif feat_name == NORM1:
|
elif feat_name == LOWER:
|
||||||
return lex.norm1
|
return lex.lower
|
||||||
elif feat_name == NORM2:
|
elif feat_name == NORM:
|
||||||
return lex.norm2
|
return lex.norm
|
||||||
elif feat_name == SHAPE:
|
elif feat_name == SHAPE:
|
||||||
return lex.shape
|
return lex.shape
|
||||||
elif feat_name == PREFIX:
|
elif feat_name == PREFIX:
|
||||||
|
|
|
@ -17,8 +17,8 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
|
||||||
const float* empty_vec) except -1:
|
const float* empty_vec) except -1:
|
||||||
lex.length = props['length']
|
lex.length = props['length']
|
||||||
lex.orth = string_store[props['orth']]
|
lex.orth = string_store[props['orth']]
|
||||||
lex.norm1 = string_store[props['norm1']]
|
lex.lower = string_store[props['lower']]
|
||||||
lex.norm2 = string_store[props['norm2']]
|
lex.norm = string_store[props['norm']]
|
||||||
lex.shape = string_store[props['shape']]
|
lex.shape = string_store[props['shape']]
|
||||||
lex.prefix = string_store[props['prefix']]
|
lex.prefix = string_store[props['prefix']]
|
||||||
lex.suffix = string_store[props['suffix']]
|
lex.suffix = string_store[props['suffix']]
|
||||||
|
|
|
@ -12,8 +12,8 @@ cdef struct LexemeC:
|
||||||
attr_t length
|
attr_t length
|
||||||
|
|
||||||
attr_t orth
|
attr_t orth
|
||||||
attr_t norm1
|
attr_t lower
|
||||||
attr_t norm2
|
attr_t norm
|
||||||
attr_t shape
|
attr_t shape
|
||||||
attr_t prefix
|
attr_t prefix
|
||||||
attr_t suffix
|
attr_t suffix
|
||||||
|
|
|
@ -51,8 +51,8 @@ cdef class Token:
|
||||||
cdef readonly attr_t cluster
|
cdef readonly attr_t cluster
|
||||||
cdef readonly attr_t length
|
cdef readonly attr_t length
|
||||||
cdef readonly attr_t orth
|
cdef readonly attr_t orth
|
||||||
cdef readonly attr_t norm1
|
cdef readonly attr_t lower
|
||||||
cdef readonly attr_t norm2
|
cdef readonly attr_t norm
|
||||||
cdef readonly attr_t shape
|
cdef readonly attr_t shape
|
||||||
cdef readonly attr_t prefix
|
cdef readonly attr_t prefix
|
||||||
cdef readonly attr_t suffix
|
cdef readonly attr_t suffix
|
||||||
|
|
|
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
|
||||||
from .vocab cimport EMPTY_LEXEME
|
from .vocab cimport EMPTY_LEXEME
|
||||||
from .typedefs cimport attr_id_t, attr_t
|
from .typedefs cimport attr_id_t, attr_t
|
||||||
from .typedefs cimport LEMMA
|
from .typedefs cimport LEMMA
|
||||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .typedefs cimport POS, LEMMA
|
from .typedefs cimport POS, LEMMA
|
||||||
|
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
@ -44,10 +44,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
return lex.id
|
return lex.id
|
||||||
elif feat_name == ORTH:
|
elif feat_name == ORTH:
|
||||||
return lex.orth
|
return lex.orth
|
||||||
elif feat_name == NORM1:
|
elif feat_name == LOWER:
|
||||||
return lex.norm1
|
return lex.lower
|
||||||
elif feat_name == NORM2:
|
elif feat_name == NORM:
|
||||||
return lex.norm2
|
return lex.norm
|
||||||
elif feat_name == SHAPE:
|
elif feat_name == SHAPE:
|
||||||
return lex.shape
|
return lex.shape
|
||||||
elif feat_name == PREFIX:
|
elif feat_name == PREFIX:
|
||||||
|
@ -223,8 +223,8 @@ cdef class Token:
|
||||||
self.cluster = t.lex.cluster
|
self.cluster = t.lex.cluster
|
||||||
self.length = t.lex.length
|
self.length = t.lex.length
|
||||||
self.orth = t.lex.orth
|
self.orth = t.lex.orth
|
||||||
self.norm1 = t.lex.norm1
|
self.lower = t.lex.lower
|
||||||
self.norm2 = t.lex.norm2
|
self.norm = t.lex.norm
|
||||||
self.shape = t.lex.shape
|
self.shape = t.lex.shape
|
||||||
self.prefix = t.lex.prefix
|
self.prefix = t.lex.prefix
|
||||||
self.suffix = t.lex.suffix
|
self.suffix = t.lex.suffix
|
||||||
|
@ -254,12 +254,6 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self._seq.data[self.i].lex.length
|
return self._seq.data[self.i].lex.length
|
||||||
|
|
||||||
def check_flag(self, attr_id_t flag):
|
|
||||||
return self.flags & (1 << flag)
|
|
||||||
|
|
||||||
def is_pos(self, univ_tag_t pos):
|
|
||||||
return self.tag == pos
|
|
||||||
|
|
||||||
property head:
|
property head:
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
"""The token predicted by the parser to be the head of the current token."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -267,7 +261,6 @@ cdef class Token:
|
||||||
return Token(self._seq, self.i + t.head)
|
return Token(self._seq, self.i + t.head)
|
||||||
|
|
||||||
property string:
|
property string:
|
||||||
"""The unicode string of the word, with no whitespace padding."""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
if t.lex.orth == 0:
|
if t.lex.orth == 0:
|
||||||
|
@ -279,13 +272,13 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.vocab.strings[self.orth]
|
return self._seq.vocab.strings[self.orth]
|
||||||
|
|
||||||
property norm1_:
|
property lower_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.vocab.strings[self.norm1]
|
return self._seq.vocab.strings[self.lower]
|
||||||
|
|
||||||
property norm2_:
|
property norm_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.vocab.strings[self.norm2]
|
return self._seq.vocab.strings[self.norm]
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -90,8 +90,8 @@ cpdef enum attr_id_t:
|
||||||
|
|
||||||
ID
|
ID
|
||||||
ORTH
|
ORTH
|
||||||
NORM1
|
LOWER
|
||||||
NORM2
|
NORM
|
||||||
SHAPE
|
SHAPE
|
||||||
PREFIX
|
PREFIX
|
||||||
SUFFIX
|
SUFFIX
|
||||||
|
|
|
@ -195,8 +195,8 @@ cdef class Vocab:
|
||||||
for i in range(self.lexemes.size()):
|
for i in range(self.lexemes.size()):
|
||||||
# Cast away the const, cos we can modify our lexemes
|
# Cast away the const, cos we can modify our lexemes
|
||||||
lex = <LexemeC*>self.lexemes[i]
|
lex = <LexemeC*>self.lexemes[i]
|
||||||
if lex.norm1 < vectors.size():
|
if lex.lower < vectors.size():
|
||||||
lex.repvec = vectors[lex.norm1]
|
lex.repvec = vectors[lex.lower]
|
||||||
else:
|
else:
|
||||||
lex.repvec = EMPTY_VEC
|
lex.repvec = EMPTY_VEC
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue