mirror of https://github.com/explosion/spaCy.git
Fixes for new StringStore
This commit is contained in:
parent
8a24c60c1e
commit
7996d21717
|
@ -112,9 +112,9 @@ cdef class StringStore:
|
|||
elif isinstance(string_or_id, bytes):
|
||||
key = hash_utf8(string_or_id, len(string_or_id))
|
||||
return key
|
||||
elif string_or_id < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[string_or_id]
|
||||
else:
|
||||
if string_or_id < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[string_or_id]
|
||||
key = string_or_id
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
|
@ -151,14 +151,24 @@ cdef class StringStore:
|
|||
string (unicode): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
if len(string) == 0:
|
||||
cdef hash_t key
|
||||
if isinstance(string, int) or isinstance(string, long):
|
||||
if string == 0:
|
||||
return True
|
||||
key = string
|
||||
elif len(string) == 0:
|
||||
return True
|
||||
if string in SYMBOLS_BY_STR:
|
||||
elif string in SYMBOLS_BY_STR:
|
||||
return True
|
||||
if isinstance(string, unicode):
|
||||
elif isinstance(string, unicode):
|
||||
key = hash_string(string)
|
||||
else:
|
||||
string = string.encode('utf8')
|
||||
cdef hash_t key = hash_utf8(string, len(string))
|
||||
return self._map.get(key) is not NULL
|
||||
key = hash_utf8(string, len(string))
|
||||
if key < len(SYMBOLS_BY_INT):
|
||||
return True
|
||||
else:
|
||||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
|
|
@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
|
|||
from ..lexeme cimport Lexeme
|
||||
from ..symbols cimport punct
|
||||
from ..attrs cimport IS_SPACE
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef inline bint is_space_token(const TokenC* token) nogil:
|
||||
|
@ -268,7 +269,7 @@ cdef cppclass StateC:
|
|||
this._s_i -= 1
|
||||
this.shifted[this.B(0)] = True
|
||||
|
||||
void add_arc(int head, int child, int label) nogil:
|
||||
void add_arc(int head, int child, attr_t label) nogil:
|
||||
if this.has_head(child):
|
||||
this.del_arc(this.H(child), child)
|
||||
|
||||
|
@ -312,7 +313,7 @@ cdef cppclass StateC:
|
|||
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
||||
h.l_kids -= 1
|
||||
|
||||
void open_ent(int label) nogil:
|
||||
void open_ent(attr_t label) nogil:
|
||||
this._ents[this._e_i].start = this.B(0)
|
||||
this._ents[this._e_i].label = label
|
||||
this._ents[this._e_i].end = -1
|
||||
|
@ -324,7 +325,7 @@ cdef cppclass StateC:
|
|||
this._ents[this._e_i-1].end = this.B(0)+1
|
||||
this._sent[this.B(0)].ent_iob = 1
|
||||
|
||||
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
|
||||
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
|
||||
if 0 <= i < this.length:
|
||||
this._sent[i].ent_iob = ent_iob
|
||||
this._sent[i].ent_type = ent_type
|
||||
|
|
|
@ -123,6 +123,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
return gold
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
cdef attr_t label
|
||||
if name == '-' or name == None:
|
||||
move_str = 'M'
|
||||
label = 0
|
||||
|
@ -241,7 +242,7 @@ cdef class Begin:
|
|||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
|
|
@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
|
|||
cimport cython
|
||||
|
||||
from ..structs cimport TokenC, Entity
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
from ._state cimport StateC
|
||||
|
@ -105,19 +106,19 @@ cdef class StateClass:
|
|||
cdef inline void unshift(self) nogil:
|
||||
self.c.unshift()
|
||||
|
||||
cdef inline void add_arc(self, int head, int child, int label) nogil:
|
||||
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
|
||||
self.c.add_arc(head, child, label)
|
||||
|
||||
cdef inline void del_arc(self, int head, int child) nogil:
|
||||
self.c.del_arc(head, child)
|
||||
|
||||
cdef inline void open_ent(self, int label) nogil:
|
||||
cdef inline void open_ent(self, attr_t label) nogil:
|
||||
self.c.open_ent(label)
|
||||
|
||||
cdef inline void close_ent(self) nogil:
|
||||
self.c.close_ent()
|
||||
|
||||
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
|
||||
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
|
||||
self.c.set_ent_tag(i, ent_iob, ent_type)
|
||||
|
||||
cdef inline void set_break(self, int i) nogil:
|
||||
|
|
|
@ -10,6 +10,7 @@ from collections import defaultdict, OrderedDict
|
|||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
@ -37,7 +38,7 @@ cdef class TransitionSystem:
|
|||
for action, label_strs in labels_by_action.items():
|
||||
for label_str in label_strs:
|
||||
self.add_action(int(action), label_str)
|
||||
self.root_label = self.strings['ROOT']
|
||||
self.root_label = self.strings.add('ROOT')
|
||||
self.init_beam_state = _init_state
|
||||
|
||||
def __reduce__(self):
|
||||
|
@ -125,24 +126,30 @@ cdef class TransitionSystem:
|
|||
if n_gold <= 0:
|
||||
print(gold.words)
|
||||
print(gold.ner)
|
||||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise "
|
||||
"the entity recognizer\n"
|
||||
"The transition system has %d actions.\n"
|
||||
"%s" % (self.n_moves))
|
||||
"The transition system has %d actions." % (self.n_moves))
|
||||
|
||||
def add_action(self, int action, label):
|
||||
if not isinstance(label, int):
|
||||
label = self.strings[label]
|
||||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int):
|
||||
label_id = self.strings.add(label_name)
|
||||
else:
|
||||
label_id = label_name
|
||||
# Check we're not creating a move we already have, so that this is
|
||||
# idempotent
|
||||
for trans in self.c[:self.n_moves]:
|
||||
if trans.move == action and trans.label == label:
|
||||
if trans.move == action and trans.label == label_id:
|
||||
return 0
|
||||
if self.n_moves >= self._size:
|
||||
self._size *= 2
|
||||
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
return 1
|
||||
|
|
|
@ -336,7 +336,7 @@ cdef class Doc:
|
|||
cdef int i
|
||||
cdef const TokenC* token
|
||||
cdef int start = -1
|
||||
cdef int label = 0
|
||||
cdef attr_t label = 0
|
||||
output = []
|
||||
for i in range(self.length):
|
||||
token = &self.c[i]
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
cimport numpy as np
|
||||
|
||||
from .doc cimport Doc
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef class Span:
|
||||
|
@ -9,7 +10,7 @@ cdef class Span:
|
|||
cdef readonly int end
|
||||
cdef readonly int start_char
|
||||
cdef readonly int end_char
|
||||
cdef readonly int label
|
||||
cdef readonly attr_t label
|
||||
|
||||
cdef public _vector
|
||||
cdef public _vector_norm
|
||||
|
|
|
@ -43,6 +43,7 @@ cdef class Span:
|
|||
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
|
||||
else:
|
||||
self.end_char = 0
|
||||
assert label in doc.vocab.strings, label
|
||||
self.label = label
|
||||
self._vector = vector
|
||||
self._vector_norm = vector_norm
|
||||
|
@ -256,6 +257,7 @@ cdef class Span:
|
|||
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||
# so it's okay once we have the Span objects. See Issue #375
|
||||
spans = []
|
||||
cdef attr_t label
|
||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
||||
spans.append(Span(self, start, end, label=label))
|
||||
for span in spans:
|
||||
|
|
Loading…
Reference in New Issue