From f51e6a6c162f0d611c0ffb0b2f6b17f96f10f146 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 May 2017 12:51:09 +0200 Subject: [PATCH] Adjust lexeme sizing for attr_t being 64 bit --- spacy/lexeme.pxd | 2 +- spacy/lexeme.pyx | 24 ++++++++++++------------ spacy/structs.pxd | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index b058c66e3..b88631340 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -27,7 +27,7 @@ cdef class Lexeme: cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: cdef SerializedLexemeC lex_data buff = &lex.flags - end = &lex.l2_norm + sizeof(lex.l2_norm) + end = &lex.sentiment + sizeof(lex.sentiment) for i in range(sizeof(lex_data.data)): lex_data.data[i] = buff[i] return lex_data diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 0e82791fd..1cc6c073e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -35,11 +35,11 @@ cdef class Lexeme: tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). """ - def __init__(self, Vocab vocab, int orth): + def __init__(self, Vocab vocab, attr_t orth): """Create a Lexeme object. vocab (Vocab): The parent vocabulary - orth (int): The orth id of the lexeme. + orth (uint64): The orth id of the lexeme. Returns (Lexeme): The newly constructd object. """ self.vocab = vocab @@ -51,7 +51,7 @@ cdef class Lexeme: if isinstance(other, Lexeme): a = self.orth b = other.orth - elif isinstance(other, int): + elif isinstance(other, long): a = self.orth b = other elif isinstance(other, str): @@ -109,7 +109,7 @@ cdef class Lexeme: def to_bytes(self): lex_data = Lexeme.c_to_bytes(self.c) start = &self.c.flags - end = &self.c.l2_norm + sizeof(self.c.l2_norm) + end = &self.c.sentiment + sizeof(self.c.sentiment) assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) byte_string = b'\0' * sizeof(lex_data.data) byte_chars = byte_string @@ -192,31 +192,31 @@ cdef class Lexeme: property lower: def __get__(self): return self.c.lower - def __set__(self, int x): self.c.lower = x + def __set__(self, attr_t x): self.c.lower = x property norm: def __get__(self): return self.c.norm - def __set__(self, int x): self.c.norm = x + def __set__(self, attr_t x): self.c.norm = x property shape: def __get__(self): return self.c.shape - def __set__(self, int x): self.c.shape = x + def __set__(self, attr_t x): self.c.shape = x property prefix: def __get__(self): return self.c.prefix - def __set__(self, int x): self.c.prefix = x + def __set__(self, attr_t x): self.c.prefix = x property suffix: def __get__(self): return self.c.suffix - def __set__(self, int x): self.c.suffix = x + def __set__(self, attr_t x): self.c.suffix = x property cluster: def __get__(self): return self.c.cluster - def __set__(self, int x): self.c.cluster = x + def __set__(self, attr_t x): self.c.cluster = x property lang: def __get__(self): return self.c.lang - def __set__(self, int x): self.c.lang = x + def __set__(self, attr_t x): self.c.lang = x property prob: def __get__(self): return self.c.prob @@ -252,7 +252,7 @@ cdef class Lexeme: property is_oov: def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x) + def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x) property is_stop: def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 09d2f65b2..20fabb9d3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -27,7 +27,7 @@ cdef struct LexemeC: cdef struct SerializedLexemeC: - unsigned char[4*13 + 8] data + unsigned char[8 + 8*10 + 4 + 4] data # sizeof(flags_t) # flags # + sizeof(attr_t) # lang # + sizeof(attr_t) # id @@ -58,10 +58,10 @@ cdef struct TokenC: bint spacy int tag int idx - int lemma + attr_t lemma int sense int head - int dep + attr_t dep bint sent_start uint32_t l_kids