spaCy/spacy/syntax/ner.pyx

494 lines
16 KiB
Cython
Raw Normal View History

2017-04-15 11:05:15 +00:00
# coding: utf-8
2015-02-02 05:38:52 +00:00
from __future__ import unicode_literals
2017-04-15 11:05:15 +00:00
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
2017-05-27 20:50:21 +00:00
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
2017-04-15 11:05:15 +00:00
from .stateclass cimport StateClass
from ._state cimport StateC
2015-03-09 05:46:22 +00:00
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
2015-02-02 05:38:52 +00:00
# Move (transition) types of the BILUO tagging scheme. The enumerators are
# numbered consecutively from 0, so N_MOVES, being last, equals the number of
# move types declared before it.
cdef enum:
    MISSING
    BEGIN
    IN
    LAST
    UNIT
    OUT
    ISNT
    N_MOVES


# Tag prefix of every move type, indexed by the enum value above.
MOVE_NAMES = [None] * N_MOVES
# Moves that carry an entity label ('B-ORG', 'I-ORG', ...).
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
# Moves without an entity of their own: outside, missing annotation, and the
# "is not this entity" marker used for partial supervision.
MOVE_NAMES[OUT] = 'O'
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[ISNT] = 'x'


# One slot per move type for transition callbacks.
cdef do_func_t[N_MOVES] do_funcs
2015-06-10 05:09:17 +00:00
cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
    # Report whether the currently open entity can no longer match the gold
    # annotation: either the gold move at the entity's first token does not
    # start an entity (neither BEGIN nor UNIT), or it starts one with a
    # different label. With no open entity the answer is always False.
    cdef const Transition* gold_at_start
    if not st.entity_is_open():
        return False
    gold_at_start = &golds[st.E(0)]
    return ((gold_at_start.move != BEGIN and gold_at_start.move != UNIT)
            or gold_at_start.label != st.E_(0).ent_type)
2015-08-09 00:31:53 +00:00
2015-03-09 05:46:22 +00:00
cdef class BiluoPushDown(TransitionSystem):
    """Transition system that tags entities with BILUO moves.

    Every (move, label) pair is one transition. The move types are the
    module-level enum values MISSING, BEGIN, IN, LAST, UNIT, OUT and ISNT.
    """
    def __init__(self, *args, **kwargs):
        TransitionSystem.__init__(self, *args, **kwargs)

    def __reduce__(self):
        """Pickle support: rebuild from the string store and the labels
        grouped by move type, in the order the transitions are stored."""
        labels_by_action = OrderedDict()
        for trans in self.c[:self.n_moves]:
            label_str = self.strings[trans.label]
            labels_by_action.setdefault(trans.move, []).append(label_str)
        return (BiluoPushDown, (self.strings, labels_by_action),
                None, None)

    @classmethod
    def get_actions(cls, **kwargs):
        """Build the mapping {move type: [label, ...]} for the system.

        Keyword arguments read:
            actions: an existing mapping to extend in place. When absent a
                fresh one is created with an empty-string label for MISSING
                and OUT and no labels for BEGIN, IN, LAST and UNIT.
            entity_types: iterable of labels to register.
            gold_parses: iterable of (raw_text, sents) pairs; the labels of
                all BILUO tags found in them are registered as well.

        Each new label is appended once to BEGIN, IN, LAST and UNIT.

        Raises:
            ValueError: a tag other than 'O' or '-' contains no hyphen.
        """
        actions = kwargs.get('actions',
                             OrderedDict((
                                 (MISSING, ['']),
                                 (BEGIN, []),
                                 (IN, []),
                                 (LAST, []),
                                 (UNIT, []),
                                 (OUT, [''])
                             )))
        labelled_moves = (BEGIN, IN, LAST, UNIT)
        seen_entities = set()
        for entity_type in kwargs.get('entity_types', []):
            if entity_type in seen_entities:
                continue
            seen_entities.add(entity_type)
            for action in labelled_moves:
                actions[action].append(entity_type)
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, biluo), _ in sents:
                for ner_tag in biluo:
                    if ner_tag == 'O' or ner_tag == '-':
                        continue
                    if '-' not in ner_tag:
                        raise ValueError(ner_tag)
                    # Split on the first hyphen only, exactly as
                    # lookup_transition does, so labels that themselves
                    # contain a hyphen are accepted by both methods.
                    label = ner_tag.split('-', 1)[1]
                    if label in seen_entities:
                        continue
                    seen_entities.add(label)
                    for action in labelled_moves:
                        actions[action].append(label)
        return actions

    property action_types:
        def __get__(self):
            return (BEGIN, IN, LAST, UNIT, OUT)

    def move_name(self, int move, attr_t label):
        """Return the tag string for a (move, label) pair: 'O' for OUT, 'M'
        for MISSING, otherwise '<prefix>-<label string>'."""
        if move == OUT:
            return 'O'
        elif move == MISSING:
            return 'M'
        else:
            return MOVE_NAMES[move] + '-' + self.strings[label]

    def has_gold(self, GoldParse gold, start=0, end=None):
        """Return True if any tag in gold.ner[start:end] is annotated, i.e.
        is neither '-' nor None."""
        # NOTE(review): a falsy `end` (including 0) selects the whole tail of
        # the sequence; kept as in the original.
        end = end or len(gold.ner)
        return any(tag not in ('-', None) for tag in gold.ner[start:end])

    def preprocess_gold(self, GoldParse gold):
        """Fill gold.c.ner with the transition for every tag in gold.ner.

        Returns the same gold object, or None when it carries no NER
        annotation at all (see has_gold).
        """
        if not self.has_gold(gold):
            return None
        for i in range(gold.length):
            gold.c.ner[i] = self.lookup_transition(gold.ner[i])
        return gold

    def get_beam_annot(self, Beam beam):
        """Return {(start, end, label): summed probability} over the entities
        of all final states in the beam."""
        entities = {}
        probs = beam.probs
        for i in range(beam.size):
            stcls = <StateClass>beam.at(i)
            if stcls.is_final():
                self.finalize_state(stcls.c)
                prob = probs[i]
                for j in range(stcls.c._e_i):
                    start = stcls.c._ents[j].start
                    end = stcls.c._ents[j].end
                    label = stcls.c._ents[j].label
                    entities.setdefault((start, end, label), 0.0)
                    entities[(start, end, label)] += prob
        return entities

    def get_beam_parses(self, Beam beam):
        """Return a list of (probability, [(start, end, label string), ...]),
        one item per final state in the beam."""
        parses = []
        probs = beam.probs
        for i in range(beam.size):
            stcls = <StateClass>beam.at(i)
            if stcls.is_final():
                self.finalize_state(stcls.c)
                prob = probs[i]
                parse = []
                for j in range(stcls.c._e_i):
                    start = stcls.c._ents[j].start
                    end = stcls.c._ents[j].end
                    label = stcls.c._ents[j].label
                    parse.append((start, end, self.strings[label]))
                parses.append((prob, parse))
        return parses

    cdef Transition lookup_transition(self, object name) except *:
        # Map a tag string to its Transition.
        #   None or '-'     -> the MISSING transition
        #   '!O'            -> an ISNT transition with label 0
        #   '<move>-!<lab>' -> an ISNT transition for <lab>
        #   '<move>-<lab>'  -> the stored transition for that move and label
        #   anything else   -> the stored transition for that move, label 0
        # ISNT transitions are built on the fly and are not looked up in the
        # stored table. Raises KeyError(name) when no stored transition has
        # the requested move and label.
        cdef attr_t label
        if name is None or name == '-':
            move_str = 'M'
            label = 0
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
        elif '-' in name:
            move_str, label_str = name.split('-', 1)
            # Hacky way to denote 'not this entity'
            if label_str.startswith('!'):
                label_str = label_str[1:]
                move_str = 'x'
            label = self.strings.add(label_str)
        else:
            move_str = name
            label = 0
        move = MOVE_NAMES.index(move_str)
        if move == ISNT:
            return Transition(clas=0, move=ISNT, label=label, score=0)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
        else:
            raise KeyError(name)

    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        # Build the Transition struct for (clas, move, label) and attach the
        # is_valid / do / get_cost callbacks of the class implementing that
        # move. Raises Exception(move) for a move without an implementation
        # here (ISNT has none).
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        if move == MISSING:
            t.is_valid = Missing.is_valid
            t.do = Missing.transition
            t.get_cost = Missing.cost
        elif move == BEGIN:
            t.is_valid = Begin.is_valid
            t.do = Begin.transition
            t.get_cost = Begin.cost
        elif move == IN:
            t.is_valid = In.is_valid
            t.do = In.transition
            t.get_cost = In.cost
        elif move == LAST:
            t.is_valid = Last.is_valid
            t.do = Last.transition
            t.get_cost = Last.cost
        elif move == UNIT:
            t.is_valid = Unit.is_valid
            t.do = Unit.transition
            t.get_cost = Unit.cost
        elif move == OUT:
            t.is_valid = Out.is_valid
            t.do = Out.transition
            t.get_cost = Out.cost
        else:
            raise Exception(move)
        return t

    cdef int initialize_state(self, StateC* st) nogil:
        # Register BEGIN/IN/UNIT/LAST actions for every entity type already
        # set on the tokens of the state.
        # This is especially necessary when we use limited training data.
        for i in range(st.length):
            if st._sent[i].ent_type != 0:
                with gil:
                    self.add_action(BEGIN, st._sent[i].ent_type)
                    self.add_action(IN, st._sent[i].ent_type)
                    self.add_action(UNIT, st._sent[i].ent_type)
                    self.add_action(LAST, st._sent[i].ent_type)
2015-06-01 21:05:25 +00:00
cdef class Missing:
    """Move standing for a token without gold annotation: it is never valid,
    changes nothing when applied, and always costs 9000."""
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # Never offered as a candidate move.
        return False

    @staticmethod
    cdef int transition(StateC* s, attr_t label) nogil:
        # Deliberately leaves the state untouched.
        return 0

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        # Constant cost, independent of state and gold annotation.
        return 9000
cdef class Begin:
    """B move: open an entity with the given label at the first buffer token.

    The ent_iob codes tested below are the ones the moves in this file write
    with set_ent_tag: 1 by In/Last, 2 by Out, 3 by Begin/Unit; 0 means no
    preset entity.
    """
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int cur_iob = st.B_(0).ent_iob
        cdef int next_iob = st.B_(1).ent_iob
        # Ensure we don't clobber preset entities: a token preset as I or O
        # can never start an entity.
        if cur_iob == 1 or cur_iob == 2:
            return False
        # A token preset as B may only start an entity of its preset type.
        if cur_iob == 3 and st.B_(0).ent_type != label:
            return False
        # If the next word is B or O, we can't B now
        if next_iob == 2 or next_iob == 3:
            return False
        # If the current word is B, and the next word isn't I, the current
        # word is really U
        if cur_iob == 3 and next_iob != 1:
            return False
        # Don't allow entities to extend across sentence boundaries
        if st.B_(1).sent_start:
            return False
        return label != 0 and not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open the entity, tag the first buffer token with iob code 3 and the
        # label, then push followed by pop.
        st.open_ent(label)
        st.set_ent_tag(st.B(0), 3, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        cdef attr_t gold_label = gold.ner[s.B(0)].label
        # No annotation for this token: nothing to get wrong.
        if gold_move == MISSING:
            return 0
        # B, Gold B --> costs 1 unless the labels match
        if gold_move == BEGIN:
            return label != gold_label
        # Support partial supervision in the form of "not this label":
        # costs 1 only when we pick exactly the excluded label.
        if gold_move == ISNT:
            return label == gold_label
        # B against gold I, L, O or U is always wrong.
        return 1
cdef class In:
    """I move: continue the currently open entity over the first buffer
    token."""
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        # A token preset with iob code 2 or 3 cannot continue an entity
        # (Out writes code 2, Begin/Unit write code 3).
        if preset_ent_iob == 2:
            return False
        elif preset_ent_iob == 3:
            return False
        # TODO: Is this quite right?
        # I think it's supposed to be ensuring the gazetteer matches are maintained
        elif st.B_(1).ent_iob != preset_ent_iob:
            return False
        # Don't allow entities to extend across sentence boundaries
        elif st.B_(1).sent_start:
            return False
        # Only the label of the open entity may be continued.
        return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Tag the first buffer token with iob code 1 and the label, then push
        # followed by pop.
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        # Gold move of the token after the current one, or OUT when there is
        # no such token. The guard has to test B(1), the index actually used
        # for the lookup; testing B(0) let an invalid index reach gold.ner on
        # the last token.
        # NOTE(review): assumes B(i) returns a negative index once the buffer
        # is exhausted -- confirm against the StateClass/StateC definition.
        cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
        cdef int g_act = gold.ner[s.B(0)].move
        cdef bint is_sunk = _entity_is_sunk(s, gold.ner)

        if g_act == MISSING:
            return 0
        elif g_act == BEGIN:
            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
            return 0
        elif g_act == IN:
            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
            return 0
        elif g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
            return not (is_sunk and (next_act == OUT or next_act == MISSING))
        elif g_act == OUT:
            # I, Gold O --> True iff next tag == O
            return not (next_act == OUT or next_act == MISSING)
        elif g_act == UNIT:
            # I, Gold U --> True iff next tag == O
            return next_act != OUT
        # Support partial supervision in the form of "not this label"
        elif g_act == ISNT:
            return 0
        else:
            return 1
cdef class Last:
    """L move: close the currently open entity on the first buffer token."""
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # Not allowed when the next token is preset with iob code 1 (the code
        # In/Last write); otherwise the open entity must carry this label.
        return (st.B_(1).ent_iob != 1
                and st.entity_is_open()
                and label != 0
                and st.E_(0).ent_type == label)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Close the entity, tag the first buffer token with iob code 1 and
        # the label, then push followed by pop.
        st.close_ent()
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        # L, Gold I --> True iff this entity sunk
        if gold_move == IN:
            return not _entity_is_sunk(s, gold.ner)
        # L against a missing annotation, gold B, L, O, U, or a "not this
        # label" constraint never costs anything.
        if (gold_move == MISSING or gold_move == BEGIN or gold_move == LAST
                or gold_move == OUT or gold_move == UNIT or gold_move == ISNT):
            return 0
        return 1
cdef class Unit:
    """U move: mark the first buffer token as a single-token entity."""
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int cur_iob = st.B_(0).ent_iob
        # A token preset with iob code 2 or 1 cannot be a unit entity
        # (Out writes code 2, In/Last write code 1).
        if cur_iob == 2 or cur_iob == 1:
            return False
        # A token preset with code 3 keeps its preset entity type.
        if cur_iob == 3 and st.B_(0).ent_type != label:
            return False
        # The next token is preset with code 1, so an entity must stay open.
        if st.B_(1).ent_iob == 1:
            return False
        return label != 0 and not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open and immediately close the entity, tag the first buffer token
        # with iob code 3 and the label, then push followed by pop.
        st.open_ent(label)
        st.close_ent()
        st.set_ent_tag(st.B(0), 3, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        cdef attr_t gold_label = gold.ner[s.B(0)].label
        # No annotation for this token: nothing to get wrong.
        if gold_move == MISSING:
            return 0
        # U, Gold U --> True iff tag match
        if gold_move == UNIT:
            return label != gold_label
        # Support partial supervision in the form of "not this label":
        # costs 1 only when we pick exactly the excluded label.
        if gold_move == ISNT:
            return label == gold_label
        # U against gold B, I, L or O is always wrong.
        return 1
2015-02-02 05:38:52 +00:00
cdef class Out:
    """O move: mark the first buffer token as outside any entity."""
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int cur_iob = st.B_(0).ent_iob
        # A token preset with iob code 3 or 1 belongs to an entity
        # (Begin/Unit write code 3, In/Last write code 1).
        if cur_iob == 3 or cur_iob == 1:
            return False
        return not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Tag the first buffer token with iob code 2 and no label, then push
        # followed by pop. The label argument is ignored.
        st.set_ent_tag(st.B(0), 2, 0)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        cdef attr_t gold_label = gold.ner[s.B(0)].label
        # "Not this label" with label 0 (what the '!O' tag maps to) forbids
        # O itself; any other "not this label" constraint is satisfied by O.
        if gold_move == ISNT:
            return 1 if gold_label == 0 else 0
        # O, Gold I / Gold L / Gold O --> True; missing annotation is free.
        if (gold_move == MISSING or gold_move == IN
                or gold_move == LAST or gold_move == OUT):
            return 0
        # O, Gold B / Gold U --> False, as is any unrecognised gold move.
        return 1
2017-03-11 17:12:01 +00:00
2015-02-02 05:38:52 +00:00
class OracleError(Exception):
    """Exception type named for oracle failures; adds nothing to Exception."""
class UnknownMove(Exception):
    """Exception type named for unknown moves; adds nothing to Exception."""