From 6f1743692add1507b76b30ac6b347c662467446f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Aug 2015 20:49:18 +0200 Subject: [PATCH] * Work on language-independent refactoring --- spacy/en/__init__.py | 2 ++ spacy/lexeme.pxd | 33 ++++++++++++++++++++++++++-- spacy/lexeme.pyx | 9 +++----- spacy/matcher.pyx | 7 +++--- spacy/orth.pyx | 1 + spacy/strings.pyx | 2 ++ spacy/tokens/doc.pyx | 6 ++--- spacy/tokens/token.pyx | 27 ++++++++++++----------- spacy/vocab.pxd | 5 +++-- spacy/vocab.pyx | 50 +++++++++++++++++++++--------------------- 10 files changed, 88 insertions(+), 54 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index a04b615da..3d433e497 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False): 'sentiment': 0 } +get_lex_attr = {} + if_model_present = -1 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..510840b2b 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,7 +16,8 @@ cdef class Lexeme: cdef readonly Vocab vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + @staticmethod + cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: lex.length = props['length'] lex.orth = vocab.strings[props['orth']] lex.lower = vocab.strings[props['lower']] @@ -29,7 +31,6 @@ cdef class Lexeme: lex.sentiment = props['sentiment'] lex.flags = props['flags'] - lex.repvec = empty_vec @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -55,6 +56,34 @@ cdef class Lexeme: return lex.cluster else: return 0 + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: + cdef flags_t one = 1 + if value: + lex.flags |= one << flag_id + else: + lex.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..4deec60c1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,9 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) + assert self.c.orth == orth - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -113,7 +110,7 @@ cdef class Lexeme: def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..9d1220648 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -103,20 +103,21 @@ cdef class Matcher: def __init__(self, vocab, patterns): self.mem = Pool() + self.vocab = vocab for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/orth.pyx b/spacy/orth.pyx index ca4bbd9ba..df4e2dc32 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -92,6 +92,7 @@ cpdef bint like_url(unicode string): return False +# TODO: This should live in the language.orth NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eighteen nineteen twenty thirty forty fifty sixty seventy' diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c187a6aa6..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -142,6 +142,8 @@ cdef class StringStore: def load(self, loc): with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) + if strings == ['']: + return None cdef unicode string cdef bytes byte_string for string in strings: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7994c97c3..0fa562dfb 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr +from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: @@ -218,6 +217,7 @@ cdef class Doc: t.idx = 0 else: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + assert t.lex.orth != 0 t.spacy = has_space self.length += 1 self._py_tokens.append(None) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f1f2696cb..04945ecd1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV +from ..lexeme cimport Lexeme + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -42,7 +43,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +287,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..cf7a46388 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,15 +27,16 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings - cdef readonly object pos_tags cdef readonly int length cdef public object _serializer cdef public object data_dir - cdef public float oov_prob + cdef public object get_lex_attr + cdef public object pos_tags cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef PreshMap _by_hash diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..4c35ea41c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,17 +35,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} - self.get_lex_attr = get_lex_attr self.repvec_length = 0 - self.length = 0 - self._add_lex_to_vocab(0, &EMPTY_LEXEME) + self.length = 1 + self.pos_tags = pos_tags if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) @@ -84,7 +81,10 @@ cdef class Vocab: cdef LexemeC* lex cdef hash_t key = hash_string(string) lex = self._by_hash.get(key) + cdef size_t addr if lex != NULL: + print string, lex.orth, self.strings[string] + assert lex.orth == self.strings[string] return lex else: return self._new_lexeme(mem, string) @@ -103,15 +103,24 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef hash_t key cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem + mem = self.mem + #if len(string) < 3: + # mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - for attr, func in self.lex_attr_getters.items(): - Lexeme.set_struct_attr(lex, attr, func(string)) + lex.orth = self.strings[string] + lex.id = self.length + if self.get_lex_attr is not None: + for attr, func in self.get_lex_attr.items(): + value = func(string) + if isinstance(value, unicode): + value = self.strings[value] + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: + key = hash_string(string) self._add_lex_to_vocab(key, lex) assert lex != NULL, string return lex @@ -119,13 +128,14 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: self._by_hash.set(key, lex) self._by_orth.set(lex.orth, lex) + print "Add lex", key, lex.orth, self.strings[lex.orth] self.length += 1 def __iter__(self): cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme(self, orth) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -142,22 +152,12 @@ cdef class Vocab: An instance of the Lexeme Python class, with data copied on instantiation. ''' - cdef const LexemeC* lexeme cdef attr_t orth - if type(id_or_string) == int: - orth = id_or_string - lexeme = self._by_orth.get(orth) - if lexeme == NULL: - raise KeyError(id_or_string) - assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) - elif type(id_or_string) == unicode: - lexeme = self.get(self.mem, id_or_string) - assert lexeme.orth == self.strings[id_or_string] + if type(id_or_string) == unicode: + orth = self.strings[id_or_string] else: - raise ValueError("Vocab unable to map type: " - "%s. Maps unicode --> Lexeme or " - "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + orth = id_or_string + return Lexeme(self, orth) def dump(self, loc): if path.exists(loc):