* Rename NORM1 and NORM2 attrs to lower and norm

This commit is contained in:
Matthew Honnibal 2015-01-24 06:17:03 +11:00
parent 75feb52c5d
commit fda94271af
9 changed files with 42 additions and 47 deletions

View File

@ -20,8 +20,8 @@ def get_lex_props(string):
'flags': get_flags(string), 'flags': get_flags(string),
'length': len(string), 'length': len(string),
'orth': string, 'orth': string,
'norm1': string.lower(), 'lower': string.lower(),
'norm2': string, 'norm': string,
'shape': orth.word_shape(string), 'shape': orth.word_shape(string),
'prefix': string[0], 'prefix': string[0],
'suffix': string[-3:], 'suffix': string[-3:],

View File

@ -2,13 +2,14 @@ from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
from ..attrs cimport FLAG8, FLAG9, FLAG10 from ..attrs cimport FLAG8, FLAG9, FLAG10
from ..attrs cimport ORTH as _ORTH from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport NORM1 as _NORM1 from ..attrs cimport LOWER as _LOWER
from ..attrs cimport NORM2 as _NORM2 from ..attrs cimport NORM as _NORM
from ..attrs cimport CLUSTER as _CLUSTER from ..attrs cimport CLUSTER as _CLUSTER
from ..attrs cimport PREFIX as _PREFIX from ..attrs cimport PREFIX as _PREFIX
from ..attrs cimport SUFFIX as _SUFFIX from ..attrs cimport SUFFIX as _SUFFIX
from ..attrs cimport LEMMA as _LEMMA from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
cpdef enum: cpdef enum:
@ -26,10 +27,11 @@ cpdef enum:
ORTH = _ORTH ORTH = _ORTH
SHAPE = _SHAPE SHAPE = _SHAPE
LOWER = _NORM1 LOWER = _LOWER
NORM2 = _NORM2 NORM = _NORM
PREFIX = _PREFIX PREFIX = _PREFIX
SUFFIX = _SUFFIX SUFFIX = _SUFFIX
CLUSTER = _CLUSTER CLUSTER = _CLUSTER
LEMMA = _LEMMA LEMMA = _LEMMA
POS = _POS POS = _POS
TAG = _TAG

View File

@ -1,5 +1,5 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .structs cimport LexemeC from .structs cimport LexemeC
from .strings cimport StringStore from .strings cimport StringStore
@ -21,15 +21,15 @@ cdef class Lexeme:
cdef readonly attr_t length cdef readonly attr_t length
cdef readonly attr_t orth cdef readonly attr_t orth
cdef readonly attr_t norm1 cdef readonly attr_t lower
cdef readonly attr_t norm2 cdef readonly attr_t norm
cdef readonly attr_t shape cdef readonly attr_t shape
cdef readonly attr_t prefix cdef readonly attr_t prefix
cdef readonly attr_t suffix cdef readonly attr_t suffix
cdef readonly unicode orth_ cdef readonly unicode orth_
cdef readonly unicode norm1_ cdef readonly unicode lower_
cdef readonly unicode norm2_ cdef readonly unicode norm_
cdef readonly unicode shape_ cdef readonly unicode shape_
cdef readonly unicode prefix_ cdef readonly unicode prefix_
cdef readonly unicode suffix_ cdef readonly unicode suffix_
@ -50,15 +50,15 @@ cdef class Lexeme:
py.length = ptr.length py.length = ptr.length
py.orth = ptr.orth py.orth = ptr.orth
py.norm1 = ptr.norm1 py.lower = ptr.lower
py.norm2 = ptr.norm2 py.norm = ptr.norm
py.shape = ptr.shape py.shape = ptr.shape
py.prefix = ptr.prefix py.prefix = ptr.prefix
py.suffix = ptr.suffix py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth] py.orth_ = strings[ptr.orth]
py.norm1_ = strings[ptr.norm1] py.lower_ = strings[ptr.lower]
py.norm2_ = strings[ptr.norm2] py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape] py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix] py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix] py.suffix_ = strings[ptr.suffix]
@ -80,10 +80,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id return lex.id
elif feat_name == ORTH: elif feat_name == ORTH:
return lex.orth return lex.orth
elif feat_name == NORM1: elif feat_name == LOWER:
return lex.norm1 return lex.norm
elif feat_name == NORM2: elif feat_name == NORM:
return lex.norm2 return lex.norm
elif feat_name == SHAPE: elif feat_name == SHAPE:
return lex.shape return lex.shape
elif feat_name == PREFIX: elif feat_name == PREFIX:

View File

@ -17,8 +17,8 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
const float* empty_vec) except -1: const float* empty_vec) except -1:
lex.length = props['length'] lex.length = props['length']
lex.orth = string_store[props['orth']] lex.orth = string_store[props['orth']]
lex.norm1 = string_store[props['norm1']] lex.lower = string_store[props['lower']]
lex.norm2 = string_store[props['norm2']] lex.norm = string_store[props['norm']]
lex.shape = string_store[props['shape']] lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']] lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']] lex.suffix = string_store[props['suffix']]

View File

@ -12,8 +12,8 @@ cdef struct LexemeC:
attr_t length attr_t length
attr_t orth attr_t orth
attr_t norm1 attr_t lower
attr_t norm2 attr_t norm
attr_t shape attr_t shape
attr_t prefix attr_t prefix
attr_t suffix attr_t suffix

View File

@ -51,8 +51,8 @@ cdef class Token:
cdef readonly attr_t cluster cdef readonly attr_t cluster
cdef readonly attr_t length cdef readonly attr_t length
cdef readonly attr_t orth cdef readonly attr_t orth
cdef readonly attr_t norm1 cdef readonly attr_t lower
cdef readonly attr_t norm2 cdef readonly attr_t norm
cdef readonly attr_t shape cdef readonly attr_t shape
cdef readonly attr_t prefix cdef readonly attr_t prefix
cdef readonly attr_t suffix cdef readonly attr_t suffix

View File

@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
from .vocab cimport EMPTY_LEXEME from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
from unidecode import unidecode from unidecode import unidecode
@ -44,10 +44,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id return lex.id
elif feat_name == ORTH: elif feat_name == ORTH:
return lex.orth return lex.orth
elif feat_name == NORM1: elif feat_name == LOWER:
return lex.norm1 return lex.lower
elif feat_name == NORM2: elif feat_name == NORM:
return lex.norm2 return lex.norm
elif feat_name == SHAPE: elif feat_name == SHAPE:
return lex.shape return lex.shape
elif feat_name == PREFIX: elif feat_name == PREFIX:
@ -223,8 +223,8 @@ cdef class Token:
self.cluster = t.lex.cluster self.cluster = t.lex.cluster
self.length = t.lex.length self.length = t.lex.length
self.orth = t.lex.orth self.orth = t.lex.orth
self.norm1 = t.lex.norm1 self.lower = t.lex.lower
self.norm2 = t.lex.norm2 self.norm = t.lex.norm
self.shape = t.lex.shape self.shape = t.lex.shape
self.prefix = t.lex.prefix self.prefix = t.lex.prefix
self.suffix = t.lex.suffix self.suffix = t.lex.suffix
@ -254,12 +254,6 @@ cdef class Token:
""" """
return self._seq.data[self.i].lex.length return self._seq.data[self.i].lex.length
def check_flag(self, attr_id_t flag):
return self.flags & (1 << flag)
def is_pos(self, univ_tag_t pos):
return self.tag == pos
property head: property head:
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
def __get__(self): def __get__(self):
@ -267,7 +261,6 @@ cdef class Token:
return Token(self._seq, self.i + t.head) return Token(self._seq, self.i + t.head)
property string: property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self): def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.orth == 0: if t.lex.orth == 0:
@ -279,13 +272,13 @@ cdef class Token:
def __get__(self): def __get__(self):
return self._seq.vocab.strings[self.orth] return self._seq.vocab.strings[self.orth]
property norm1_: property lower_:
def __get__(self): def __get__(self):
return self._seq.vocab.strings[self.norm1] return self._seq.vocab.strings[self.lower]
property norm2_: property norm_:
def __get__(self): def __get__(self):
return self._seq.vocab.strings[self.norm2] return self._seq.vocab.strings[self.norm]
property shape_: property shape_:
def __get__(self): def __get__(self):

View File

@ -90,8 +90,8 @@ cpdef enum attr_id_t:
ID ID
ORTH ORTH
NORM1 LOWER
NORM2 NORM
SHAPE SHAPE
PREFIX PREFIX
SUFFIX SUFFIX

View File

@ -195,8 +195,8 @@ cdef class Vocab:
for i in range(self.lexemes.size()): for i in range(self.lexemes.size()):
# Cast away the const, cos we can modify our lexemes # Cast away the const, cos we can modify our lexemes
lex = <LexemeC*>self.lexemes[i] lex = <LexemeC*>self.lexemes[i]
if lex.norm1 < vectors.size(): if lex.lower < vectors.size():
lex.repvec = vectors[lex.norm1] lex.repvec = vectors[lex.lower]
else: else:
lex.repvec = EMPTY_VEC lex.repvec = EMPTY_VEC