Tidy up gold

ines 2017-10-27 17:02:55 +02:00
parent 6a0483b7aa
commit a6135336f5
1 changed file with 43 additions and 36 deletions


@@ -54,7 +54,8 @@ def merge_sents(sents):
         m_deps[3].extend(head + i for head in heads)
         m_deps[4].extend(labels)
         m_deps[5].extend(ner)
-        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
+        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
+                          for b in brackets)
         i += len(ids)
     return [(m_deps, m_brackets)]
@@ -80,6 +81,8 @@ def align(cand_words, gold_words):
 punct_re = re.compile(r'\W')
+
+
 def _min_edit_path(cand_words, gold_words):
     cdef:
         Pool mem
@@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
     mem = Pool()
     n_cand = len(cand_words)
     n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want different
-    # costs.
-    # Mark operations with a string, and score the history using _edit_cost.
+    # Levenshtein distance, except we need the history, and we may want
+    # different costs. Mark operations with a string, and score the history
+    # using _edit_cost.
     previous_row = []
     prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
     curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
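
Note: the comment above names the technique without showing it. For orientation, here is a pure-Python sketch of Levenshtein distance that also records the operation history; the 'M'/'S'/'I'/'D' marks and unit costs are illustrative stand-ins, not the actual strings or costs used by `_min_edit_path`.

    # Illustration only -- not the Cython implementation above.
    def min_edit_path_sketch(cand_words, gold_words):
        # table[i][j] = (cost, history) for cand_words[:i] vs gold_words[:j]
        table = [[(0, '')] * (len(gold_words) + 1)
                 for _ in range(len(cand_words) + 1)]
        for j in range(1, len(gold_words) + 1):
            table[0][j] = (j, 'I' * j)   # insert every gold word
        for i in range(1, len(cand_words) + 1):
            table[i][0] = (i, 'D' * i)   # delete every candidate word
        for i in range(1, len(cand_words) + 1):
            for j in range(1, len(gold_words) + 1):
                match = cand_words[i - 1] == gold_words[j - 1]
                sub = (table[i - 1][j - 1][0] + (0 if match else 1),
                       table[i - 1][j - 1][1] + ('M' if match else 'S'))
                ins = (table[i][j - 1][0] + 1, table[i][j - 1][1] + 'I')
                dele = (table[i - 1][j][0] + 1, table[i - 1][j][1] + 'D')
                table[i][j] = min(sub, ins, dele)
        return table[-1][-1]

    print(min_edit_path_sketch(['a', 'b'], ['a', 'c', 'b']))  # (1, 'MIM')
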
@@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
 def minibatch(items, size=8):
-    '''Iterate over batches of items. `size` may be an iterator,
-    so that batch-size can vary on each step.
-    '''
+    """Iterate over batches of items. `size` may be an iterator,
+    so that batch-size can vary on each step.
+    """
     if isinstance(size, int):
         size_ = itertools.repeat(8)
     else:
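
Note: a hedged usage sketch of the iterator case the docstring describes, assuming the rest of the function consumes one size per batch. As shown above, the integer branch repeats the default 8 rather than `size`, so passing an iterator is the reliable way to control batch size here. The sizes below are arbitrary example values, with one spare so the size iterator outlives the data:

    sizes = iter([2, 3, 5, 8])
    for batch in minibatch(range(10), size=sizes):
        print(batch)
    # [0, 1]
    # [2, 3, 4]
    # [5, 6, 7, 8, 9]
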
@@ -168,6 +171,7 @@ class GoldCorpus(object):
         train_path (unicode or Path): File or directory of training data.
         dev_path (unicode or Path): File or directory of development data.
+        RETURNS (GoldCorpus): The newly created object.
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
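
Note: for context, constructing the corpus this docstring describes is a one-liner; the paths below are made-up placeholders.

    from spacy.gold import GoldCorpus

    # Hypothetical paths -- each may be a single file or a directory.
    corpus = GoldCorpus('data/train.json', 'data/dev.json')
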
@@ -222,7 +226,6 @@ class GoldCorpus(object):
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
 
     @classmethod
@@ -233,7 +236,6 @@ class GoldCorpus(object):
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -248,17 +250,20 @@ class GoldCorpus(object):
             raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
         assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0])]
         else:
             return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
 
     @staticmethod
     def walk_corpus(path):
@@ -330,9 +335,9 @@ def read_json_file(loc, docs_filter=None, limit=None):
                     for i, token in enumerate(sent['tokens']):
                         words.append(token['orth'])
                         ids.append(i)
-                        tags.append(token.get('tag','-'))
-                        heads.append(token.get('head',0) + i)
-                        labels.append(token.get('dep',''))
+                        tags.append(token.get('tag', '-'))
+                        heads.append(token.get('head', 0) + i)
+                        labels.append(token.get('dep', ''))
                         # Ensure ROOT label is case-insensitive
                         if labels[-1].lower() == 'root':
                             labels[-1] = 'ROOT'
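
Note: for orientation, the loop above implies a per-token layout roughly like the following (a hedged reconstruction from the `token.get(...)` calls, not the full JSON training schema). `head` is a relative offset that gets added to `i`, so 0 means the token heads itself.

    # 'tag', 'head' and 'dep' are optional, defaulting to '-', 0 and ''.
    sent = {
        'tokens': [
            {'orth': 'London', 'tag': 'NNP', 'head': 1, 'dep': 'nsubj'},
            {'orth': 'calls', 'tag': 'VBZ', 'head': 0, 'dep': 'ROOT'},
        ]
    }
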
@@ -382,19 +387,21 @@ cdef class GoldParse:
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
-                   make_projective=make_projective)
+        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
+                   entities=entities, make_projective=make_projective)
 
-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False,
+    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
+                 heads=None, deps=None, entities=None, make_projective=False,
                  cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
         tags (iterable): A sequence of strings, representing tag annotations.
-        heads (iterable): A sequence of integers, representing syntactic head offsets.
-        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
@@ -404,9 +411,10 @@ cdef class GoldParse:
             document (usually a sentence). Unlike entity annotations, label
             annotations can overlap, i.e. a single word can be covered by
             multiple labelled spans. The TextCategorizer component expects
-            true examples of a label to have the value 1.0, and negative examples
-            of a label to have the value 0.0. Labels not in the dictionary are
-            treated as missing -- the gradient for those labels will be zero.
+            true examples of a label to have the value 1.0, and negative
+            examples of a label to have the value 0.0. Labels not in the
+            dictionary are treated as missing - the gradient for those labels
+            will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
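
Note: a hedged sketch of constructing a `GoldParse` from the arguments listed above. The sentence and annotations are made-up example values; heads are absolute token indices (a root heads itself), and the entities here use BILUO tag strings.

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    doc = nlp.make_doc('London calling')
    gold = GoldParse(doc, words=['London', 'calling'],
                     tags=['NNP', 'VBG'],
                     heads=[1, 1], deps=['nsubj', 'ROOT'],
                     entities=['U-GPE', 'O'])
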
@@ -470,11 +478,11 @@ cdef class GoldParse:
                 self.ner[i] = entities[gold_i]
 
         cycle = nonproj.contains_cycle(self.heads)
-        if cycle != None:
+        if cycle is not None:
             raise Exception("Cycle found: %s" % cycle)
 
         if make_projective:
-            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
+            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
             self.heads = proj_heads
 
     def __len__(self):
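
Note: as a quick illustration of the cycle check (hedged; heads are absolute token indices and a root token is its own head):

    from spacy.syntax import nonproj

    # Token 1 is its own head, i.e. the root: no cycle.
    assert nonproj.contains_cycle([1, 1]) is None
    # Tokens 0 and 1 head each other and no root exists: a cycle,
    # which is what the constructor above turns into an exception.
    assert nonproj.contains_cycle([1, 0]) is not None
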
@@ -497,20 +505,19 @@ cdef class GoldParse:
 def biluo_tags_from_offsets(doc, entities, missing='O'):
-    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (BILUO).
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
 
     doc (Doc): The document that the entity offsets refer to. The output tags
         will refer to the token boundaries within the document.
-    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-        `end` should be character-offset integers denoting the slice into the
-        original string.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
         action is one of "B", "I", "L", "U". The string "-" is used where the
-        entity offsets don't align with the tokenization in the `Doc` object. The
-        training algorithm will view these as missing values. "O" denotes a
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
         "I" the inside of an entity of three or more tokens, and "L" the end
         of an entity of two or more tokens. "U" denotes a single-token entity.
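
Note: a usage sketch of this encoder with example values; "London" occupies characters 7-13 of the text and aligns with a single token, so it gets a "U" tag.

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp.make_doc('I like London.')
    tags = biluo_tags_from_offsets(doc, [(7, 13, 'LOC')])
    print(tags)  # ['O', 'O', 'U-LOC', 'O']
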