diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 131d7e174..bfd521a2a 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -20,8 +20,8 @@ def get_lex_props(string): 'flags': get_flags(string), 'length': len(string), 'orth': string, - 'norm1': string.lower(), - 'norm2': string, + 'lower': string.lower(), + 'norm': string, 'shape': orth.word_shape(string), 'prefix': string[0], 'suffix': string[-3:], diff --git a/spacy/en/attrs.pxd b/spacy/en/attrs.pxd index 3582e11ec..34f8e600b 100644 --- a/spacy/en/attrs.pxd +++ b/spacy/en/attrs.pxd @@ -2,13 +2,14 @@ from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7 from ..attrs cimport FLAG8, FLAG9, FLAG10 from ..attrs cimport ORTH as _ORTH from ..attrs cimport SHAPE as _SHAPE -from ..attrs cimport NORM1 as _NORM1 -from ..attrs cimport NORM2 as _NORM2 +from ..attrs cimport LOWER as _LOWER +from ..attrs cimport NORM as _NORM from ..attrs cimport CLUSTER as _CLUSTER from ..attrs cimport PREFIX as _PREFIX from ..attrs cimport SUFFIX as _SUFFIX from ..attrs cimport LEMMA as _LEMMA from ..attrs cimport POS as _POS +from ..attrs cimport TAG as _TAG cpdef enum: @@ -26,10 +27,11 @@ cpdef enum: ORTH = _ORTH SHAPE = _SHAPE - LOWER = _NORM1 - NORM2 = _NORM2 + LOWER = _LOWER + NORM = _NORM PREFIX = _PREFIX SUFFIX = _SUFFIX CLUSTER = _CLUSTER LEMMA = _LEMMA POS = _POS + TAG = _TAG diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 4cec661c6..0723ed6fe 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,5 +1,5 @@ from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t -from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER +from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .structs cimport LexemeC from .strings cimport StringStore @@ -21,15 +21,15 @@ cdef class Lexeme: cdef readonly attr_t length cdef readonly attr_t orth - cdef readonly attr_t norm1 - cdef readonly attr_t norm2 + cdef 
readonly attr_t lower + cdef readonly attr_t norm cdef readonly attr_t shape cdef readonly attr_t prefix cdef readonly attr_t suffix cdef readonly unicode orth_ - cdef readonly unicode norm1_ - cdef readonly unicode norm2_ + cdef readonly unicode lower_ + cdef readonly unicode norm_ cdef readonly unicode shape_ cdef readonly unicode prefix_ cdef readonly unicode suffix_ @@ -50,15 +50,15 @@ cdef class Lexeme: py.length = ptr.length py.orth = ptr.orth - py.norm1 = ptr.norm1 - py.norm2 = ptr.norm2 + py.lower = ptr.lower + py.norm = ptr.norm py.shape = ptr.shape py.prefix = ptr.prefix py.suffix = ptr.suffix py.orth_ = strings[ptr.orth] - py.norm1_ = strings[ptr.norm1] - py.norm2_ = strings[ptr.norm2] + py.lower_ = strings[ptr.lower] + py.norm_ = strings[ptr.norm] py.shape_ = strings[ptr.shape] py.prefix_ = strings[ptr.prefix] py.suffix_ = strings[ptr.suffix] @@ -80,10 +80,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: return lex.id elif feat_name == ORTH: return lex.orth - elif feat_name == NORM1: - return lex.norm1 - elif feat_name == NORM2: - return lex.norm2 + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm elif feat_name == SHAPE: return lex.shape elif feat_name == PREFIX: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 0d6e9f087..59e4741da 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,8 +17,8 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store const float* empty_vec) except -1: lex.length = props['length'] lex.orth = string_store[props['orth']] - lex.norm1 = string_store[props['norm1']] - lex.norm2 = string_store[props['norm2']] + lex.lower = string_store[props['lower']] + lex.norm = string_store[props['norm']] lex.shape = string_store[props['shape']] lex.prefix = string_store[props['prefix']] lex.suffix = string_store[props['suffix']] diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 1b90abad5..1d6de506c 100644 ---
a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -12,8 +12,8 @@ cdef struct LexemeC: attr_t length attr_t orth - attr_t norm1 - attr_t norm2 + attr_t lower + attr_t norm attr_t shape attr_t prefix attr_t suffix diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 617666bc1..25263db29 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -51,8 +51,8 @@ cdef class Token: cdef readonly attr_t cluster cdef readonly attr_t length cdef readonly attr_t orth - cdef readonly attr_t norm1 - cdef readonly attr_t norm2 + cdef readonly attr_t lower + cdef readonly attr_t norm cdef readonly attr_t shape cdef readonly attr_t prefix cdef readonly attr_t suffix diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 03fa48e8e..19922cf4c 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter from .vocab cimport EMPTY_LEXEME from .typedefs cimport attr_id_t, attr_t from .typedefs cimport LEMMA -from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER +from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport POS, LEMMA from unidecode import unidecode @@ -44,10 +44,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil: return lex.id elif feat_name == ORTH: return lex.orth - elif feat_name == NORM1: - return lex.norm1 - elif feat_name == NORM2: - return lex.norm2 + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm elif feat_name == SHAPE: return lex.shape elif feat_name == PREFIX: @@ -223,8 +223,8 @@ cdef class Token: self.cluster = t.lex.cluster self.length = t.lex.length self.orth = t.lex.orth - self.norm1 = t.lex.norm1 - self.norm2 = t.lex.norm2 + self.lower = t.lex.lower + self.norm = t.lex.norm self.shape = t.lex.shape self.prefix = t.lex.prefix self.suffix = t.lex.suffix @@ -254,12 +254,6 @@ cdef class Token: """ return self._seq.data[self.i].lex.length - def check_flag(self, attr_id_t 
flag): - return self.flags & (1 << flag) - - def is_pos(self, univ_tag_t pos): - return self.tag == pos - property head: """The token predicted by the parser to be the head of the current token.""" def __get__(self): @@ -267,7 +261,6 @@ cdef class Token: return Token(self._seq, self.i + t.head) property string: - """The unicode string of the word, with no whitespace padding.""" def __get__(self): cdef const TokenC* t = &self._seq.data[self.i] if t.lex.orth == 0: @@ -279,13 +272,13 @@ cdef class Token: def __get__(self): return self._seq.vocab.strings[self.orth] - property norm1_: + property lower_: def __get__(self): - return self._seq.vocab.strings[self.norm1] + return self._seq.vocab.strings[self.lower] - property norm2_: + property norm_: def __get__(self): - return self._seq.vocab.strings[self.norm2] + return self._seq.vocab.strings[self.norm] property shape_: def __get__(self): diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 74575f4b7..9d086827a 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -90,8 +90,8 @@ cpdef enum attr_id_t: ID ORTH - NORM1 - NORM2 + LOWER + NORM SHAPE PREFIX SUFFIX diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index abede0404..8eb38b7ca 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -195,8 +195,8 @@ cdef class Vocab: for i in range(self.lexemes.size()): # Cast away the const, cos we can modify our lexemes lex = self.lexemes[i] - if lex.norm1 < vectors.size(): - lex.repvec = vectors[lex.norm1] + if lex.lower < vectors.size(): + lex.repvec = vectors[lex.lower] else: lex.repvec = EMPTY_VEC